@@ -14,6 +14,7 @@ from utility.connect import *
from datetime import datetime
import pandas as pd
+import dataset
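+# ('dataset' wraps SQLAlchemy and exposes dict-style table.insert()/table.upsert() helpers)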
import time
import json
import re
@@ -58,7 +59,7 @@ def get_url_list(driver):
#            time.sleep(0.5)
#        except:
#            pass
-    page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]', 8)
+    page_down_(driver, '//div[@class="TFQHme"]', 8)

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
@@ -79,47 +80,82 @@ def keyin_keyword(driver, keyword):
    time.sleep(3)


+def get_crawler_list(db):
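+    # Pick a keyword whose shop_item_list count is still below 500, then
+    # resume its lat/lon grid scan from the checkpoint saved in progress_list2.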
+    result = db.query('select keyword, count(*) from shop_item_list group by keyword')
+    result = pd.DataFrame([i for i in result])
+    result.columns = ['keyword', 'count']
+    result = result[result['count'] < 500]
+    keyword = result.sample(1).iloc[0]['keyword']
+
+    num = 0
+    cursor = db.query('select num from progress_list2 where kw = "' + keyword + '"')
+    for c in cursor:
+        num = c['num']
+        break
+
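+    # fetch the remaining grid points (num >= checkpoint) for this keyword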
+    cursor = db.query('select * from lat_lon_loc where num >= ' + str(num))
+    # cursor = db.query('select * from lat_lon_loc')
+    lst = []
+    for c in cursor:
+        lst.append({'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']})
+
+    return keyword, lst


def main():
-    data = pd.read_csv('lat_long_location.csv', index_col = 0)
-
-    keyword = '麻辣火鍋'
-    if len(sys.argv) >1:
-        keyword=sys.argv[1]
+#    data = pd.read_csv('lat_long_location.csv', index_col = 0)
+#    keyword = '麻辣火鍋'
    port=4444
-    if len(sys.argv) >2:
-        port=int(sys.argv[2])
+    if len(sys.argv) > 1:
+        port = int(sys.argv[1])
+#    if len(sys.argv) >2:
+#        port=int(sys.argv[2])

    print('drvier start...')
    driver = brower_start(port)
-    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
-
-    for k, row in data.iterrows():
-        try:
-            latitude = row['latitude'] # latitude
-            longitude = row['longitude'] # longitude
-            url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
-            driver.get(url)
-            keyin_keyword(driver, keyword)
-
-            for page in range(4):
-                print(keyword, k, row['location'], latitude, longitude, page)
-                url_list = get_url_list(driver)
-
-                shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
-                for item in url_list:
-                    result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
-                    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
-                        .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'',''), tuple(result))
-
-                    DA.mysql_insert_data(db, insert_sql)
-
-                if page < 2 :
-                    element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
-                    driver.implicitly_wait(30)
-                    ActionChains(driver).move_to_element(element).click(element).perform()
-        except:
-            error = pd.DataFrame([row])
-            error.to_csv('error_shop_item_list.csv', mode='a', header = False)
+# db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
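+    # dataset connection: 'shop_item_list' stores crawled shops,
+    # 'progress_list2' keeps the per-keyword resume checkpoint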
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    table = db['shop_item_list']
+    table2 = db['progress_list2']
+
+    keyword, data = get_crawler_list(db)
+    print(keyword, len(data))
+
+    for row in data:
+#        try:
+        latitude = row['lat']  # latitude
+        longitude = row['lon']  # longitude
+        # checkpoint this grid point so a restart resumes from here
+        table2.upsert({'kw': keyword, 'num': row['num']}, ['kw'])
+
+        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
+        driver.get(url)
+        keyin_keyword(driver, keyword)
+
+        for page in range(4):
+            print(keyword, row['loc'], latitude, longitude, page)
+            url_list = get_url_list(driver)
+
+            shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
+            for item in url_list:
+                try:
+                    table.insert({'name': item[1], 'lon': longitude, 'lat': latitude, 'keyword': keyword, 'item_url': item[0], 'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
+                except:
+                    print('dup entry')
+
+#                result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
+#                insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
+#                    .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'',''), tuple(result))
+
+#                DA.mysql_insert_data(db, insert_sql)
+
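+            # click the next-page control (hard-coded element id, presumably
+            # Google Maps' pagination button)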
+            if page < 2:
+                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
+                driver.implicitly_wait(30)
+                ActionChains(driver).move_to_element(element).click(element).perform()
+#        except:
+#            error = pd.DataFrame([row])
+#            error.to_csv('error_shop_item_list.csv', mode='a', header = False)
    #driver.close()
    #driver = brower_start()