|
@@ -28,10 +28,10 @@ def serive_create(profilepath):
|
|
|
|
|
|
option.add_argument('--disable-web-security')
|
|
|
option.add_argument('--allow-running-insecure-content')
|
|
|
- option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
|
|
|
+ #option.add_argument("--user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data")
|
|
|
option.add_argument("profile-directory="+profilepath)
|
|
|
|
|
|
- driver = webdriver.Chrome('./utility/chromedriver_20211103/chromedriver', options=option)
|
|
|
+ driver = webdriver.Chrome('./utility/chromedriver_win32/chromedriver', options=option)
|
|
|
executor_url = driver.command_executor._url
|
|
|
session_id = driver.session_id
|
|
|
print (session_id)
|
|
@@ -45,7 +45,7 @@ def brower_start(port):
|
|
|
# browser = webdriver.Chrome(options=options)
|
|
|
|
|
|
browser = webdriver.Remote(
|
|
|
- command_executor='http://192.53.174.202:'+str(port)+'/wd/hub',
|
|
|
+ command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
|
|
|
desired_capabilities=options.to_capabilities()
|
|
|
)
|
|
|
return browser
|
|
@@ -115,38 +115,44 @@ def get_shop_info(driver, output, shop_soup):
|
|
|
|
|
|
def get_intro_info(driver, output):
|
|
|
# element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
|
|
|
- element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}簡介']".format(output['name']))
|
|
|
- driver.implicitly_wait(10)
|
|
|
- ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
+ try:
|
|
|
+ element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}簡介']".format(output['name']))
|
|
|
+ driver.implicitly_wait(5)
|
|
|
+ ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
|
|
|
- # pageSource = driver.page_source
|
|
|
- # fileToWrite = open("page_source.html", "w")
|
|
|
- # fileToWrite.write(pageSource)
|
|
|
- # fileToWrite.close()
|
|
|
-
|
|
|
- page_down_(driver, '//*[@id="pane"]/div/div[1]', 3)
|
|
|
-
|
|
|
- intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
- for key in intro_list:
|
|
|
- elements = intro_soup.find('div',{'aria-label':key})
|
|
|
- if elements:
|
|
|
- element = elements.find_all('li',{'class':'LQjNnc-p83tee-JNdkSc-ibnC6b'})
|
|
|
- count = 0
|
|
|
- tmp = []
|
|
|
- for ele in element:
|
|
|
- # if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
|
|
|
- if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"}):
|
|
|
- tmp += [{
|
|
|
- 'id':count,
|
|
|
- intro_list[key][1]: blank_check(ele.text)
|
|
|
- }]
|
|
|
- count += 1
|
|
|
- print(str(tmp))
|
|
|
- output[intro_list[key][0]] = str(tmp)
|
|
|
- else:
|
|
|
+ # pageSource = driver.page_source
|
|
|
+ # fileToWrite = open("page_source.html", "w")
|
|
|
+ # fileToWrite.write(pageSource)
|
|
|
+ # fileToWrite.close()
|
|
|
+
|
|
|
+ page_down_(driver, '//*[@id="pane"]/div/div[1]', 3)
|
|
|
+
|
|
|
+ intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
+ for key in intro_list:
|
|
|
+ elements = intro_soup.find('div',{'aria-label':key})
|
|
|
+ if elements:
|
|
|
+ element = elements.find_all('li',{'class':'LQjNnc-p83tee-JNdkSc-ibnC6b'})
|
|
|
+ count = 0
|
|
|
+ tmp = []
|
|
|
+ for ele in element:
|
|
|
+ # if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
|
|
|
+ if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"}):
|
|
|
+ tmp += [{
|
|
|
+ 'id':count,
|
|
|
+ intro_list[key][1]: blank_check(ele.text)
|
|
|
+ }]
|
|
|
+ count += 1
|
|
|
+ print(str(tmp))
|
|
|
+ output[intro_list[key][0]] = str(tmp)
|
|
|
+ else:
|
|
|
+ output[intro_list[key][0]] = '[]'
|
|
|
+ driver.back()
|
|
|
+ return output
|
|
|
+
|
|
|
+    except Exception:
|
|
|
+ for key in intro_list:
|
|
|
output[intro_list[key][0]] = '[]'
|
|
|
- driver.back()
|
|
|
- return output
|
|
|
+ return output
|
|
|
|
|
|
|
|
|
def get_time_list(shop_soup, output):
|
|
@@ -475,12 +481,12 @@ def main():
|
|
|
url_pd = get_not_cralwer_url(keyword)
|
|
|
print('drvier start {}...'.format(keyword))
|
|
|
driver = brower_start(port)
|
|
|
- # driver = serive_create('Profile 1')
|
|
|
+ #driver = serive_create('Profile 6')
|
|
|
#profilepath = 'Profile 1'
|
|
|
#driver = serive_create_linux(profilepath)
|
|
|
|
|
|
for key, row in url_pd.iterrows():
|
|
|
- try:
|
|
|
+ # try:
|
|
|
name = row['name']
|
|
|
item_url = row['item_url']
|
|
|
print(key, name, ': ' ,item_url)
|
|
@@ -525,11 +531,11 @@ def main():
|
|
|
output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
|
|
|
data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
|
|
|
|
|
|
- except Exception as e:
|
|
|
- print(e)
|
|
|
- error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
|
|
|
- data_select_insert(db, 'error_list', error_table_col, row)
|
|
|
- time.sleep(2)
|
|
|
+ # except Exception as e:
|
|
|
+ # print(e)
|
|
|
+ # error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
|
|
|
+ # data_select_insert(db, 'error_list', error_table_col, row)
|
|
|
+ # time.sleep(2)
|
|
|
# driver.close()
|
|
|
# driver = brower_start(port)
|
|
|
# driver = serive_create_linux(profilepath)
|