|
@@ -32,21 +32,33 @@ def brower_start(port):
|
|
|
|
|
|
|
|
|
def page_down_(driver, xpath_css, time_):
    """Click just past the last result separator to advance the result list.

    Reads the result-count label (``span[class="Jl2AFb"]``), takes the part
    of its text after the first ``-`` with the `` 項結果`` suffix removed,
    and, when that number is greater than 5, clicks one pixel to the right
    of the last ``div[class="TFQHme"]`` element up to ``time_`` times with a
    0.5 s pause after each click.

    NOTE(review): presumably the click focuses the results pane so that more
    entries load -- confirm against the live page.

    Args:
        driver: Selenium WebDriver showing the search results.
        xpath_css: Unused; kept so existing callers do not break.
        time_: Maximum number of clicks to perform.

    Raises:
        IndexError: If the label text contains no ``-``.
        ValueError: If the extracted count is not an integer.
    """
    count_label = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
    result_count = count_label.text.split('-')[1].replace(' 項結果', '')
    print(result_count)

    # Nothing to do when the (short) list is shown in full.
    if int(result_count) <= 5:
        return

    for _ in range(time_):
        # Re-query on every pass: new separators may have been rendered
        # since the previous click.
        separators = driver.find_elements_by_css_selector('div[class="TFQHme"]')
        if not separators:
            # No anchor to click on; indexing [-1] would raise IndexError.
            break
        last = separators[-1]

        action = ActionChains(driver)
        # Offset of width + 1 lands just outside the element's right edge.
        action.move_to_element_with_offset(last, last.size['width'] + 1, 0)
        action.click()
        action.perform()
        time.sleep(0.5)
|
|
|
|
|
|
|
|
|
def get_url_list(driver):
|
|
@@ -113,59 +125,70 @@ def keyin_keyword(driver, keyword):
|
|
|
|
|
|
|
|
|
def get_crawler_list(db):
    """Return the next keyword to crawl.

    Loads every row of ``shop_item_list`` ordered by keyword, drops rows
    whose keyword is missing or contains ``項``, drops keywords returned by
    the ``progress_list2`` query (``num < 367``), and returns the first
    keyword that remains.

    Args:
        db: Database handle exposing ``query(sql)`` that yields dict-like
            rows.

    Returns:
        The ``keyword`` value of the first remaining row.

    Raises:
        IndexError: If no keyword is left to crawl.
    """
    candidates = pd.DataFrame(
        list(db.query('select * from shop_item_list order by keyword'))
    )
    if candidates.empty:
        raise IndexError('shop_item_list returned no keywords')

    # na=True counts NULL keywords as matches so they are filtered out too;
    # without it the mask holds NaN and the ``~`` inversion fails.
    candidates = candidates[~candidates['keyword'].str.contains('項', na=True)]

    progress = pd.DataFrame(
        list(db.query('select distinct(kw) from progress_list2 where num < 367'))
    )
    if not progress.empty:
        candidates = candidates[~candidates['keyword'].isin(progress['kw'].to_list())]

    if candidates.empty:
        raise IndexError('no keyword left to crawl')

    return candidates.iloc[0]['keyword']
|
|
|
|
|
|
|
|
|
def get_lon_lat_list(db, keyword):
    """Return the grid locations still to be crawled for ``keyword``.

    Looks up the ``num`` stored for ``keyword`` in ``progress_list``
    (defaulting to 0 when there is no row) and returns every ``lat_lon_loc``
    row whose ``num`` is greater than or equal to it.

    NOTE(review): this reads ``progress_list`` while the rest of the file
    reads and writes ``progress_list2`` -- confirm which table is intended.

    Args:
        db: Database handle whose ``query(sql, **params)`` binds ``:name``
            placeholders and yields dict-like rows.
        keyword: Search keyword whose progress is looked up.

    Returns:
        A list of dicts with keys ``num``, ``loc``, ``lat`` and ``lon``.
    """
    # Bind parameters instead of concatenating: a keyword containing a quote
    # would otherwise break (or inject into) the statement.
    progress = db.query('select num from progress_list where kw = :kw', kw=keyword)
    first = next(iter(progress), None)
    num = first['num'] if first is not None else 0

    cursor = db.query('select * from lat_lon_loc where num >= :num', num=num)
    return [
        {'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']}
        for c in cursor
    ]
|
|
|
+
|
|
|
+
|
|
|
def main():
|
|
|
-# data = pd.read_csv('lat_long_location.csv', index_col = 0)
|
|
|
-# keyword = '麻辣火鍋'
|
|
|
- lon_lat = [[121.567,25.038], [121.567,25.046], [121.543,25.046], [121.543,25.038]]
|
|
|
+ db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
|
|
|
+ table = db['shop_item_list3']
|
|
|
+ table2 = db['progress_list2']
|
|
|
+
|
|
|
port=4447
|
|
|
if len(sys.argv) > 1 :
|
|
|
port=int(sys.argv[1])
|
|
|
print('restart docker p{}'.format(port))
|
|
|
- os.system('sudo docker container restart pp'+str(port))
|
|
|
+ os.system('sudo docker container restart p'+str(port))
|
|
|
time.sleep(8)
|
|
|
|
|
|
-# if len(sys.argv) >2:
|
|
|
-# port=int(sys.argv[2])
|
|
|
print('drvier start...')
|
|
|
driver = brower_start(port)
|
|
|
- # db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
|
|
|
- db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
|
|
|
- table=db['shop_item_list3']
|
|
|
- table2=db['progress_list2']
|
|
|
+
|
|
|
|
|
|
- for i in range(20):
|
|
|
+ for i in range(10):
|
|
|
try:
|
|
|
keyword = get_crawler_list(db)
|
|
|
print(keyword)
|
|
|
+ lst = get_lon_lat_list(db, keyword)
|
|
|
+ print(keyword, len(lst))
|
|
|
|
|
|
- c = 0
|
|
|
- for row in lon_lat:
|
|
|
- c += 1
|
|
|
- # latitude = row['lat'] #緯度
|
|
|
- # longitude = row['lon'] #精度
|
|
|
- latitude = row[1] #緯度
|
|
|
- longitude = row[0] #精度
|
|
|
- # table2.upsert({'kw':keyword,'num':row['num']},['kw'])
|
|
|
- table2.insert({'kw':keyword,'num':c})
|
|
|
+ for r in lst:
|
|
|
+ latitude = r['lat'] #緯度
|
|
|
+ longitude = r['lon'] #精度
|
|
|
+ table2.upsert({'kw':keyword,'num':r['num']},['kw'])
|
|
|
|
|
|
url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
|
|
|
driver.get(url)
|
|
|
keyin_keyword(driver, keyword)
|
|
|
failcnt = 0
|
|
|
- for page in range(5):
|
|
|
+ for page in range(10):
|
|
|
print(keyword, latitude, longitude, page)
|
|
|
url_list = get_url_list(driver)
|
|
|
duplicate = 0
|
|
@@ -173,7 +196,7 @@ def main():
|
|
|
for item in url_list:
|
|
|
try:
|
|
|
table.insert({'name':item[1],'lon':longitude, 'lat':latitude, \
|
|
|
- 'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
|
|
|
+ 'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
|
|
|
except:
|
|
|
duplicate += 1
|
|
|
print(len(url_list), duplicate)
|
|
@@ -185,6 +208,8 @@ def main():
|
|
|
|
|
|
if page < 2 :
|
|
|
element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
|
|
|
+ if element.get_attribute('disabled'):
|
|
|
+ break
|
|
|
driver.implicitly_wait(30)
|
|
|
ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
except:
|