|
@@ -0,0 +1,217 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+#from selenium import webdriver
|
|
|
+from seleniumwire import webdriver
|
|
|
+from selenium.webdriver.common.action_chains import ActionChains
|
|
|
+from selenium.webdriver.common.keys import Keys
|
|
|
+from selenium.webdriver.support import expected_conditions as EC
|
|
|
+from selenium.webdriver.support.wait import WebDriverWait
|
|
|
+from selenium.webdriver.common.by import By
|
|
|
+import selenium
|
|
|
+import traceback
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+
|
|
|
+from utility import database_access as DA
|
|
|
+from utility.parseutils import *
|
|
|
+from utility.connect import *
|
|
|
+
|
|
|
+from datetime import datetime
|
|
|
+import pandas as pd
|
|
|
+import dataset
|
|
|
+import time
|
|
|
+import json
|
|
|
+import re
|
|
|
+import sys, os
|
|
|
+import socket
|
|
|
+import brotli
|
|
|
+
|
|
|
# When true, brower_start() launches a local Chrome window instead of
# connecting to the remote WebDriver hub. main() turns it on for one
# specific host IP.
chrome_window = False
|
|
|
+
|
|
|
def brower_start(port):
    """Create the browser session used by the crawler.

    A local Chrome instance is started when the module-level
    ``chrome_window`` flag is true; otherwise the function connects to a
    remote WebDriver hub at ``http://127.0.0.1:<port>/wd/hub``.

    Args:
        port: Port of the local WebDriver hub (only used for the remote case).

    Returns:
        The ``webdriver.Chrome`` or ``webdriver.Remote`` instance.
    """
    capabilities = webdriver.ChromeOptions().to_capabilities()

    if chrome_window:
        return webdriver.Chrome(desired_capabilities=capabilities)

    hub_url = 'http://127.0.0.1:' + str(port) + '/wd/hub'
    return webdriver.Remote(
        command_executor=hub_url,
        desired_capabilities=capabilities,
    )
|
|
|
+
|
|
|
+
|
|
|
def page_down_(driver, xpath_css, time_):
    """Scroll the result list by clicking just to the right of its last pane.

    The result counter element (``span.Jl2AFb``) is read first: the text
    after the first ``-`` with the `` 項結果`` suffix removed is taken as the
    result count. Nothing is clicked unless that count is greater than 5.

    Args:
        driver: Selenium driver positioned on a search-result page.
        xpath_css: Unused; kept so existing callers keep working.
        time_: Maximum number of scroll clicks to perform.

    Raises:
        IndexError: If the counter text contains no ``-``.
        ValueError: If the extracted count is not an integer.
    """
    counter = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
    result_count = counter.text.split('-')[1].replace(' 項結果', '')
    print(result_count)
    if int(result_count) <= 5:
        return

    for _ in range(time_):
        panes = driver.find_elements_by_css_selector('div[class="TFQHme"]')
        if not panes:
            # No pane to anchor the click on; indexing [-1] would raise
            # IndexError, so stop scrolling instead.
            break
        last_pane = panes[-1]
        # Use the ActionChains name imported at the top of the file, as
        # keyin_keyword() does.
        action = ActionChains(driver)
        action.move_to_element_with_offset(
            last_pane, last_pane.size['width'] + 1, 0
        )
        action.click()
        action.perform()
        time.sleep(0.5)
|
|
|
+
|
|
|
+
|
|
|
def get_url_list(driver):
    """Collect the place links shown on the current result page.

    Scrolls the result list via ``page_down_`` (up to 8 clicks), then parses
    ``driver.page_source`` and keeps every ``<a>`` whose ``href`` contains
    ``maps/place`` and which also has an ``aria-label`` attribute.

    Args:
        driver: Selenium driver positioned on a search-result page.

    Returns:
        A list of ``[href, aria_label]`` pairs in document order.
    """
    page_down_(driver, '//div[@class="TFQHme"]', 8)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        label = anchor.get('aria-label')
        # Anchors missing either attribute are skipped explicitly rather than
        # through a bare ``except`` that would also hide unrelated errors.
        if href is None or label is None:
            continue
        if 'maps/place' in href:
            url_list.append([href, label])
    return url_list
|
|
|
+
|
|
|
+
|
|
|
def keyin_keyword(driver, keyword):
    """Type ``keyword`` into the search box and submit it with RETURN.

    Args:
        driver: Selenium driver with the map page already loaded.
        keyword: Search text to send.
    """
    # Set the implicit wait before the lookup so the search box lookup itself
    # benefits from it (it was previously configured only afterwards).
    driver.implicitly_wait(30)
    search_box = driver.find_element_by_id("searchbox")

    chain = ActionChains(driver).move_to_element(search_box)
    chain.send_keys(keyword).send_keys(Keys.RETURN).perform()

    # Fixed pause after submitting, before the caller continues.
    time.sleep(3)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def get_crawler_list(db, fixed_keyword='滷味'):
    """Return the keyword to crawl next.

    By default the function returns ``fixed_keyword`` without touching the
    database, which is what the previous hard-coded ``return`` did. Pass
    ``fixed_keyword=None`` to pick the keyword from ``progress_list2``
    instead (the query that used to be unreachable).

    Args:
        db: Database handle exposing ``query(sql)`` that yields row mappings.
        fixed_keyword: Keyword to return unconditionally, or ``None`` to
            consult the database.

    Returns:
        The keyword string, or ``None`` when the database has no matching row.
    """
    if fixed_keyword is not None:
        return fixed_keyword

    cursor = db.query('select distinct(kw) from progress_list2 where num < 367 order by num asc limit 1')
    first_row = next(iter(cursor), None)
    if first_row is None:
        return None
    return first_row['kw']
|
|
|
+
|
|
|
def get_lon_lat_list(db, keyword):
    """Return the locations still to be crawled for ``keyword``.

    Looks up the ``num`` stored for ``keyword`` in ``progress_list2``
    (0 when there is no row) and returns every ``lat_lon_loc`` row whose
    ``num`` is greater than or equal to it.

    Args:
        db: Database handle whose ``query`` accepts bind parameters as
            keyword arguments.
        keyword: Search keyword whose progress is looked up.

    Returns:
        A list of dicts with the keys ``num``, ``loc``, ``lat`` and ``lon``.
    """
    # Bind parameters instead of string concatenation: a keyword containing a
    # quote can no longer break (or inject into) the statement.
    progress = db.query(
        'select num from progress_list2 where kw = :kw', kw=keyword
    )
    first_row = next(iter(progress), None)
    num = first_row['num'] if first_row is not None else 0

    rows = db.query('select * from lat_lon_loc where num >= :num', num=num)
    return [
        {'num': row['num'], 'loc': row['loc'], 'lat': row['lat'], 'lon': row['lon']}
        for row in rows
    ]
|
|
|
+
|
|
|
def write_to_file(jsobj, fname):
    """Write ``str(jsobj)`` to ``fname`` as UTF-8, replacing any old content.

    Args:
        jsobj: Object to dump; it is converted with ``str()``.
        fname: Path of the file to (over)write.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # ``with`` closes the handle even when write() raises. newline='' disables
    # newline translation so the bytes match what codecs.open() produced.
    with open(fname, 'w', encoding='utf-8', newline='') as fw:
        fw.write(str(jsobj))
|
|
|
+
|
|
|
def parsing_js(orig, debug_path='c:/tmp/debug.txt'):
    """Extract the JSON array embedded in a map-search response and print it.

    Newlines are dropped and escaped quotes are unescaped. The text from the
    first ``[["`` up to the ``]]`` of the first ``]]"`` that follows it is
    then parsed as JSON. For every entry of ``jsobj[0][1]`` except the first,
    elements 11, 10, 2 and 78 of the entry's index-14 list are printed.
    What those fields mean is not established by this file.

    Args:
        orig: Decoded response text.
        debug_path: Where to dump the extracted JSON text for debugging;
            pass ``None`` to skip the dump. A failing dump is reported but
            does not abort parsing.

    Raises:
        ValueError: If the start or end marker is missing, or (as
            ``json.JSONDecodeError``) if the extracted text is not valid JSON.
        IndexError, TypeError: If an entry lacks the indexed fields.
    """
    # Two unescape passes on purpose: the first turns \\" into \", the second
    # turns that into a plain quote. Done per line, then joined in one go.
    content = "".join(
        line.replace('\\"', '"').replace('\\"', '"')
        for line in orig.split('\n')
    )

    begin_match = re.search(r'\[\["', content)
    print(begin_match)
    if begin_match is None:
        raise ValueError('start marker [[" not found in response text')

    # Look for the end marker only after the start marker, so an earlier
    # stray ]]" cannot produce an empty or inverted slice.
    end_match = re.compile(r'\]\]"').search(content, begin_match.end())
    print(end_match)
    if end_match is None:
        raise ValueError('end marker ]]" not found in response text')

    # end() - 1 drops the closing quote that follows the final brackets.
    jscontent = content[begin_match.start():end_match.end() - 1]

    if debug_path:
        try:
            write_to_file(jscontent, debug_path)
        except OSError as exc:
            # The dump is a debugging aid only; a missing directory must not
            # prevent the actual parsing.
            print('debug dump skipped: {}'.format(exc))

    jsobj = json.loads(jscontent)
    for entry in jsobj[0][1][1:]:
        detail = entry[14]
        print(detail[11])
        print(detail[10])
        print(detail[2])
        print(detail[78])
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def _parse_search_responses(driver):
    """Run ``parsing_js`` on every captured map-search response of ``driver``."""
    for request in driver.requests:
        if not request.response:
            continue
        if 'https://www.google.com.tw/search?tbm=map' not in request.url:
            continue
        print('parsing js:')
        try:
            # The body is assumed to be brotli-compressed, as before.
            resp = brotli.decompress(request.response.body)
            parsing_js(resp.decode('utf-8'))
        except Exception:
            # One unparsable response should not abort the whole location.
            traceback.print_exc()


def _click_next_pages(driver):
    """Click the paging button at most twice, stopping once it is disabled."""
    # NOTE(review): the element id presumably belongs to the "next page"
    # button of the result list -- confirm against the live page.
    for _ in range(2):
        element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
        if element.get_attribute('disabled'):
            break
        driver.implicitly_wait(30)
        ActionChains(driver).move_to_element(element).click(element).perform()


def _crawl_location(driver, progress_table, keyword, location):
    """Record progress for one location, search there and parse the results."""
    latitude = location['lat']
    longitude = location['lon']
    progress_table.upsert({'kw': keyword, 'num': location['num']}, ['kw'])

    url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
    # Drop requests captured for earlier locations so their responses are not
    # parsed again below.
    del driver.requests
    driver.get(url)
    keyin_keyword(driver, keyword)

    # Fixed pause before the captured responses are inspected.
    time.sleep(11)
    print("ppppppppp&**********************")
    _parse_search_responses(driver)
    _click_next_pages(driver)


def main():
    """Entry point: crawl map search results for the current keyword.

    Connects to the database, optionally restarts the docker container
    ``p<port>`` when a port is given as the first command-line argument,
    starts the browser and then makes up to ten attempts. Each attempt
    fetches the keyword and its remaining locations and crawls every
    location. A failing attempt is logged and the next one starts.
    """
    global chrome_window
    localip = socket.gethostbyname(socket.gethostname())
    if localip == '192.168.1.108':
        chrome_window = True

    # NOTE(review): credentials are embedded in the fallback URL; set
    # GOOGLE_POI_DB_URL instead and remove the literal once deployed.
    db_url = os.environ.get(
        'GOOGLE_POI_DB_URL',
        'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4',
    )
    db = dataset.connect(db_url)
    # NOTE(review): this table handle is not used below; the lookup is kept in
    # case it has a side effect in the installed dataset version.
    table = db['shop_item_list3']
    table2 = db['progress_list2']

    port = 4447
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        print('restart docker p{}'.format(port))
        os.system('sudo docker container restart p' + str(port))
        time.sleep(8)

    print('drvier start...')
    driver = brower_start(port)

    try:
        for _ in range(10):
            try:
                keyword = get_crawler_list(db)
                print(keyword)
                lst = get_lon_lat_list(db, keyword)
                print(keyword, len(lst))

                for r in lst:
                    _crawl_location(driver, table2, keyword, r)
            except Exception:
                # Log and move on to the next attempt. Unlike a bare
                # ``except``, this lets KeyboardInterrupt stop the crawler.
                traceback.print_exc()
    finally:
        # Always release the browser session, even on Ctrl-C.
        driver.quit()
|
|
|
+
|
|
|
+
|
|
|
# Start the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()
|