@@ -89,32 +89,47 @@ def keyin_keyword(driver, keyword):
time.sleep(3)
-def get_crawler_list(db):
+# def get_crawler_list(db):
- result = db.query('select keyword, count(*) from shop_item_list group by keyword')
- result = pd.DataFrame([i for i in result])
- result.columns = ['keyword', 'count']
- result = result[result['count'] < 100]
- keyword = result.sample(1).iloc[0]['keyword']
+# result = db.query('select keyword, count(*) from shop_item_list group by keyword')
+# result = pd.DataFrame([i for i in result])
+# result.columns = ['keyword', 'count']
+# result = result[result['count'] < 100]
+# keyword = result.sample(1).iloc[0]['keyword']
- num=0
- cursor=db.query('select num from progress_list2 where kw = "'+keyword+'"')
- for c in cursor:
- num=c['num']
- break
-
- cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
- # cursor=db.query('select * from lat_lon_loc')
- lst=[]
- for c in cursor:
- lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
+# num=0
+# cursor=db.query('select num from progress_list2 where kw = "'+keyword+'"')
+# for c in cursor:
+# num=c['num']
+# break
+
+# cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
+# # cursor=db.query('select * from lat_lon_loc')
+# lst=[]
+# for c in cursor:
+# lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
- return keyword, lst
-
+# return keyword, lst
+
+def get_crawler_list(db):
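+    # Pick the next keyword to crawl: the first keyword from shop_item_list
+    # that has no record in progress_list2 yet, or simply the first keyword
+    # when progress_list2 is still empty.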
+ result = db.query('select distinct(keyword) from shop_item_list order by keyword')
+ result = pd.DataFrame([i for i in result])
+ progress = db.query('select distinct(kw) from progress_list2')
+ progress = pd.DataFrame([i for i in progress])
+
+ if len(progress) != 0:
+ keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0].values[0]
+ else:
+ keyword = result.iloc[0].values[0]
+
+ return keyword
+
+
def main():
# data = pd.read_csv('lat_long_location.csv', index_col = 0)
# keyword = '麻辣火鍋'
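+    # Hard-coded [longitude, latitude] points used as map centres for the keyword search.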
+ lon_lat = [[121.567,25.038], [121.567,25.046], [121.543,25.046], [121.543,25.038]]
port=4447
if len(sys.argv) > 1 :
port=int(sys.argv[1])
@@ -128,36 +143,36 @@ def main():
driver = brower_start(port)
# db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
- table=db['shop_item_list']
+ table=db['shop_item_list2']
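+    # New results go into shop_item_list2; get_crawler_list() still reads keywords from shop_item_list.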
table2=db['progress_list2']
- keyword, data = get_crawler_list(db)
- print( keyword, len(data))
-
- for row in data:
+ keyword = get_crawler_list(db)
+ print(keyword)
+
+ c = 0
+ for row in lon_lat:
+ c += 1
try:
- latitude = row['lat'] #緯度
- longitude = row['lon'] #精度
- table2.upsert({'kw':keyword,'num':row['num']},['kw'])
+ # latitude = row['lat'] #緯度
+ # longitude = row['lon'] #精度
+            latitude = row[1]   # latitude
+            longitude = row[0]  # longitude
+ # table2.upsert({'kw':keyword,'num':row['num']},['kw'])
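+            # Record progress for this keyword: num is the 1-based index of the current map centre.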
+ table2.insert({'kw':keyword,'num':c})
url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
driver.get(url)
keyin_keyword(driver, keyword)
failcnt = 0
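+            # Scrape up to five pages of search results for this map centre.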
- for page in range(4):
+ for page in range(5):
-                print(keyword, row['loc'], latitude, longitude, page)
+                print(keyword, latitude, longitude, page)
url_list = get_url_list(driver)
- # if url_list == 'EMPTY':
- # failcnt+=1
- # if failcnt >=2:
- # break
- # continue
-
duplicate = 0
- shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
+ # shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
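+                # Insert every listed item; a failed insert (typically an already-stored item) only bumps the duplicate counter.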
for item in url_list:
try:
- table.insert({'name':item[1],'lon':longitude, 'lat':latitude, 'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
+                        table.insert({'name':item[1],'lon':longitude, 'lat':latitude,
+                                      'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
except:
duplicate += 1
print(len(url_list), duplicate)