|
@@ -5,7 +5,8 @@ from selenium.webdriver.common.keys import Keys
|
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
from selenium.webdriver.support.wait import WebDriverWait
|
|
|
from selenium.webdriver.common.by import By
|
|
|
-
|
|
|
+import selenium
|
|
|
+import traceback
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
from utility import database_access as DA
|
|
@@ -18,7 +19,7 @@ import dataset
|
|
|
import time
|
|
|
import json
|
|
|
import re
|
|
|
-import sys
|
|
|
+import sys, os
|
|
|
|
|
|
def brower_start(port):
|
|
|
options = webdriver.ChromeOptions()
|
|
@@ -59,6 +60,14 @@ def get_url_list(driver):
|
|
|
|
|
|
|
|
|
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
page_down_(driver, '//div[@class="TFQHme"]', 8)
|
|
|
|
|
|
url_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
@@ -69,7 +78,7 @@ def get_url_list(driver):
|
|
|
url_list += [[i['href'], i['aria-label']]]
|
|
|
except:
|
|
|
pass
|
|
|
- print(len(url_list))
|
|
|
+
|
|
|
return url_list
|
|
|
|
|
|
|
|
@@ -85,8 +94,8 @@ def get_crawler_list(db):
|
|
|
result = db.query('select keyword, count(*) from shop_item_list group by keyword')
|
|
|
result = pd.DataFrame([i for i in result])
|
|
|
result.columns = ['keyword', 'count']
|
|
|
- results = results[results['count'] < 500]
|
|
|
- keyword = results.sample(1).iloc[0]['keyword']
|
|
|
+ result = result[result['count'] < 100]
|
|
|
+ keyword = result.sample(1).iloc[0]['keyword']
|
|
|
|
|
|
num=0
|
|
|
cursor=db.query('select num from progress_list2 where kw = "'+keyword+'"')
|
|
@@ -106,9 +115,12 @@ def get_crawler_list(db):
|
|
|
def main():
|
|
|
|
|
|
|
|
|
- port=4444
|
|
|
+ port=4447
|
|
|
if len(sys.argv) > 1 :
|
|
|
- port=sys.argv[1]
|
|
|
+ port=int(sys.argv[1])
|
|
|
+ print('restart docker p{}'.format(port))
|
|
|
+ os.system('sudo docker container restart p'+str(port))
|
|
|
+ time.sleep(8)
|
|
|
|
|
|
|
|
|
|
|
@@ -119,41 +131,48 @@ def main():
|
|
|
table=db['shop_item_list']
|
|
|
table2=db['progress_list2']
|
|
|
|
|
|
- data, keyword = get_crawler_list(db)
|
|
|
+ keyword, data = get_crawler_list(db)
|
|
|
print( keyword, len(data))
|
|
|
|
|
|
for row in data:
|
|
|
-
|
|
|
- latitude = row['lat']
|
|
|
- longitude = row['lon']
|
|
|
- table2.upsert({'kw':keyword,'num':r['num']},['kw'])
|
|
|
-
|
|
|
- url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
|
|
|
- driver.get(url)
|
|
|
- keyin_keyword(driver, keyword)
|
|
|
-
|
|
|
- for page in range(4):
|
|
|
- print(keyword, row['loc'], latitude, longitude, page)
|
|
|
- url_list = get_url_list(driver)
|
|
|
-
|
|
|
- shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
|
|
|
- for item in url_list:
|
|
|
- try:
|
|
|
- table.insert({'name':item[1],'lon':longitude, 'lat':latitude, 'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
|
|
|
- except:
|
|
|
- print('dup entry')
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- if page < 2 :
|
|
|
- element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
|
|
|
- driver.implicitly_wait(30)
|
|
|
- ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
-
|
|
|
+ try:
|
|
|
+ latitude = row['lat']
|
|
|
+ longitude = row['lon']
|
|
|
+ table2.upsert({'kw':keyword,'num':row['num']},['kw'])
|
|
|
+
|
|
|
+ url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
|
|
|
+ driver.get(url)
|
|
|
+ keyin_keyword(driver, keyword)
|
|
|
+ failcnt = 0
|
|
|
+ for page in range(4):
|
|
|
+ print(keyword, row['loc'], latitude, longitude, page)
|
|
|
+ url_list = get_url_list(driver)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ duplicate = 0
|
|
|
+ shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
|
|
|
+ for item in url_list:
|
|
|
+ try:
|
|
|
+ table.insert({'name':item[1],'lon':longitude, 'lat':latitude, 'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
|
|
|
+ except:
|
|
|
+ duplicate += 1
|
|
|
+ print(len(url_list), duplicate)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ if page < 2 :
|
|
|
+ element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
|
|
|
+ driver.implicitly_wait(30)
|
|
|
+ ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
+ except:
|
|
|
+ pass
|
|
|
|
|
|
|
|
|
|