@@ -113,16 +113,15 @@ def get_intro_info(driver, output):
             break
 
     intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
-
    for key in intro_list:
        elements = intro_soup.find('div',{'aria-label':key})
        if elements:
            element = elements.find_all('li',{'class':'LQjNnc-p83tee-JNdkSc-ibnC6b'})
-            # print(element)
            count = 0
            tmp = []
            for ele in element:
-                if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
+                # if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
+                if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"}):
                    tmp += [{
                        'id':count,
                        intro_list[key][1]: blank_check(ele.text)
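A note on the icon swap above: Google serves the same checkmark at several pixel densities (`1x/`, `2x/`), so pinning the full URL is brittle either way. A density-agnostic check is sketched below; `has_check_icon` is an illustrative helper, not part of the patch, and assumes the same BeautifulSoup `li` tags that `get_intro_info` iterates over.

    from bs4 import BeautifulSoup

    def has_check_icon(ele):
        # Match on the icon filename so both the 1x and 2x variants pass.
        return ele.find('img', src=lambda s: s and s.endswith('check_black_18dp.png')) is not None

    # Tiny self-contained check:
    html = '<li><img src="//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"></li>'
    print(has_check_icon(BeautifulSoup(html, 'html.parser').li))  # True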
@@ -208,7 +207,7 @@ def get_reviews(driver, output):
     element = driver.find_element_by_css_selector(more_reviews_css)
     driver.implicitly_wait(20)
     ActionChains(driver).move_to_element(element).click(element).perform()
-    time.sleep(2)
+    time.sleep(1)
 
     all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
     for ap in all_photo:
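Shortening the fixed `time.sleep` calls here speeds the crawl but makes it more timing-sensitive. If the click starts flaking, Selenium's explicit waits are the usual replacement; a sketch against the selenium-3 style API this file already uses, with `more_reviews_css` taken from `get_reviews`:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def click_more_reviews(driver, more_reviews_css, timeout=10):
        # Wait until the button is actually clickable
        # instead of sleeping for a fixed interval.
        element = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css)))
        element.click()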
@@ -279,7 +278,7 @@ def find_photo_list(driver):
             actions.move_to_element(element).perform()
         except:
             break
-
+    time.sleep(1)
     photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
     photo_url = []
     for photo_id in count_list:
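The new `time.sleep(1)` gives lazily loaded photos a moment to render before the page source is parsed. A fixed pause works, but polling until the element count stops growing is more robust; a sketch under the same selenium-3 API (`wait_for_stable_count` and its arguments are illustrative, not part of the patch):

    import time

    def wait_for_stable_count(driver, class_name, timeout=10, poll=0.5):
        # Poll until the number of matching elements stops growing
        # (lazy loading has settled) or the timeout expires.
        deadline = time.time() + timeout
        last = -1
        while time.time() < deadline:
            now = len(driver.find_elements_by_class_name(class_name))
            if now == last:
                return now
            last = now
            time.sleep(poll)
        return last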
@@ -354,7 +353,10 @@ def data_select_insert(db, table_name, table_col, data):
     for name_ in table_col:
         if name_ == 'crawler_date':
             continue
-        tmp += [data[name_]]
+        if name_ == 'lon' or name_ == 'lat':
+            tmp += [float(data[name_])]
+        else:
+            tmp += [data[name_]]
 
     tmp += [datetime.today().strftime("%Y/%m/%d %H:%M")]
 
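Context for the `float()` cast: values read back through `dataset`/MySQL can arrive as `decimal.Decimal` or `str`, so the cast presumably normalizes `lon`/`lat` to a plain numeric type before re-insertion. A toy illustration (values made up):

    from decimal import Decimal

    data = {'name': 'some shop', 'lon': Decimal('121.5654'), 'lat': '25.0330'}
    tmp = []
    for name_ in ('name', 'lon', 'lat'):
        if name_ == 'lon' or name_ == 'lat':
            tmp += [float(data[name_])]   # Decimal/str -> plain float
        else:
            tmp += [data[name_]]
    print(tmp)  # ['some shop', 121.5654, 25.033]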
@@ -381,28 +383,65 @@ def time_click(driver):
     return status
 
 
-def main():
-    # driver = serive_create('Profile 1')
-    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
+def get_not_crawler_url(keyword):
     db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
-    table = db['shop_item_list2']
-    url_list = list(table.find(keyword='咖啡'))
-    shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list where keyword="{}"'.format('咖啡'))]
+    table = db['shop_item_list']
+    url_list = list(table.find(keyword=keyword))
+    shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list where keyword="{}"'.format(keyword))]
+    error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list where keyword="{}"'.format(keyword))]
+
     url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
     url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
-    url_pd = url_pd[url_pd['item_url_length']!=600]
-    url_list = url_pd[~url_pd['item_url'].isin(shop_item)]['item_url']
+    url_pd = url_pd[url_pd['item_url_length']!=1000]
+    url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
+    url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
 
+    print('have {} URL list'.format(len(url_pd)))
     # url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
+
+    return url_pd
+
+
+def service_create_linux(profilepath):
+    option = webdriver.ChromeOptions()
+    option.add_argument('--headless')
+    option.add_argument('--no-sandbox')
+    option.add_argument('--disable-web-security')
+    option.add_argument('--allow-running-insecure-content')
+    option.add_argument('--incognito')
+    option.add_argument(
+        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
+    # option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
+    option.add_argument(
+        "--user-data-dir=/home/noodlesloves/.config/google-chrome/")
+    option.add_argument("profile-directory="+profilepath)
+    driver = webdriver.Chrome('utility/chromedriver', options=option)
+    # driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', chrome_options=option,
+    #                           service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
+
+    executor_url = driver.command_executor._url
+    session_id = driver.session_id
+    print(session_id)
+    print(executor_url)
+
+    return driver
+
+
+def main():
+    keyword = '咖啡'
+    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
+    url_pd = get_not_crawler_url(keyword)
+
     print('drvier start...')
     driver = brower_start()
+
+    # driver = serive_create('Profile 1')
+    # profilepath = 'Profile 1'
+    # driver = service_create_linux(profilepath)
 
-    try:
-        for key, row in url_list.iterrows():
+    for key, row in url_pd.iterrows():
+        try:
             name = row['name']
             item_url = row['item_url']
-            # result = DA.mysql_select_data(db, 'select item_url from shop_list where item_url="{}"'.format(item_url))
-            # if len(result) != 0: continue
             print(key, name, ': ' ,item_url)
 
             driver.get(item_url)
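The refactor above folds the URL filtering into `get_not_crawler_url`: fetch all `shop_item_list` rows for the keyword, then drop anything already in `shop_list` or `error_list` via pandas boolean indexing. The same filtering in miniature, with toy lists standing in for the MySQL tables:

    import pandas as pd

    url_list = [{'name': 'a', 'item_url': 'u1'},
                {'name': 'b', 'item_url': 'u2'},
                {'name': 'c', 'item_url': 'u3'}]
    shop_item = ['u1']    # already crawled
    error_item = ['u3']   # failed on a previous run

    url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
    url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
    url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
    print(url_pd)  # only the 'u2' row survives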
@@ -413,7 +452,7 @@ def main():
             time.sleep(0.5)
             print('start...')
             time_status = time_click(driver)
-            time.sleep(1)
+            time.sleep(0.5)
             shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
 
             output = {
@@ -435,15 +474,16 @@ def main():
             query_name = '{}+{}'.format(output_name, output['addr'])
             query_name = query_name.replace(' ','')
             output['item_url'] = item_url
+            output['keyword'] = keyword
             output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
-            time.sleep(1)
             data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
 
-    except:
-        error = pd.DataFrame([row])
-        error.to_csv('error.csv', mode='a', header = False)
-        driver.close()
-        driver = brower_start()
+        except:
+            error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
+            data_select_insert(db, 'error_list', error_table_col, row)
+            driver.close()
+            driver = brower_start()
+            # driver = service_create_linux(profilepath)
 
 
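With this change a failed row lands in the `error_list` table (which `get_not_crawler_url` excludes on the next run) instead of an ad-hoc error.csv. One caveat: the bare `except:` also traps `KeyboardInterrupt`. A slightly safer shape for the same recovery block, assuming `db`, `driver`, and `url_pd` as set up in `main()` and the helpers (`brower_start`, `data_select_insert`) defined elsewhere in this file:

    for key, row in url_pd.iterrows():
        try:
            ...  # crawl one item_url as in main()
        except Exception:
            # Record the failure so the next run skips this URL,
            # then recycle the browser session.
            error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
            data_select_insert(db, 'error_list', error_table_col, row)
            driver.close()
            driver = brower_start()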