123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217 |
- # -*- coding: utf-8 -*-
- #from selenium import webdriver
- from seleniumwire import webdriver
- from selenium.webdriver.common.action_chains import ActionChains
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.support.wait import WebDriverWait
- from selenium.webdriver.common.by import By
- import selenium
- import traceback
- from bs4 import BeautifulSoup
- from utility import database_access as DA
- from utility.parseutils import *
- from utility.connect import *
- from datetime import datetime
- import pandas as pd
- import dataset
- import time
- import json
- import re
- import sys, os
- import socket
- import brotli
# Module-level switch read by brower_start(): when True a local Chrome is
# launched, otherwise a remote WebDriver session is opened. main() flips it
# to True when the machine's IP matches one specific address.
chrome_window=False
def brower_start(port):
    """Create and return a WebDriver session.

    When the module flag ``chrome_window`` is true a local Chrome is started;
    otherwise a remote session is opened against the WebDriver endpoint at
    ``http://127.0.0.1:<port>/wd/hub``.

    Args:
        port: TCP port of the remote WebDriver endpoint (ignored for a local
            Chrome).

    Returns:
        The created driver object.
    """
    capabilities = webdriver.ChromeOptions().to_capabilities()

    if chrome_window:
        return webdriver.Chrome(desired_capabilities=capabilities)

    hub_url = 'http://127.0.0.1:{}/wd/hub'.format(port)
    return webdriver.Remote(
        command_executor=hub_url,
        desired_capabilities=capabilities,
    )
def page_down_(driver, xpath_css, time_):
    """Scroll the result pane by clicking just past its last entry.

    The result counter element (``span.Jl2AFb``) is read first; its text is
    split on ``-`` and the `` 項結果`` suffix stripped to obtain the count.
    Only when that count exceeds 5 is the pane scrolled.

    Args:
        driver: WebDriver instance showing a result list.
        xpath_css: Unused; kept so existing callers keep working.
        time_: Number of click-to-scroll repetitions.
    """
    counter = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
    result_count = counter.text.split('-')[1].replace(' 項結果','')
    print(result_count)

    # Short lists need no scrolling.
    if int(result_count) <= 5:
        return

    for _ in range(time_):
        entries = driver.find_elements_by_css_selector('div[class="TFQHme"]')
        last_entry = entries[-1]
        # Click one pixel to the right of the last entry's width.
        x_offset = last_entry.size['width'] + 1
        chain = webdriver.common.action_chains.ActionChains(driver)
        chain.move_to_element_with_offset(last_entry, x_offset, 0)
        chain.click()
        chain.perform()
        time.sleep(0.5)
def get_url_list(driver):
    """Collect place links from the currently displayed result list.

    The result pane is scrolled first (8 repetitions) and then the page
    source is parsed for ``<a>`` tags.

    Args:
        driver: WebDriver instance showing a result list.

    Returns:
        A list of ``[href, aria_label]`` pairs, one for every anchor whose
        ``href`` contains ``maps/place`` and that carries an ``aria-label``.
    """
    page_down_(driver, '//div[@class="TFQHme"]', 8)
    url_soup = BeautifulSoup(driver.page_source, 'html.parser')

    url_list = []
    for anchor in url_soup.find_all('a'):
        href = anchor.get('href')
        label = anchor.get('aria-label')
        # Anchors missing either attribute are skipped explicitly rather than
        # through a blanket ``except`` that would also hide real errors.
        if href is None or label is None:
            continue
        if 'maps/place' in href:
            url_list.append([href, label])
    return url_list
def keyin_keyword(driver, keyword):
    """Type *keyword* into the element with id ``searchbox`` and press Return.

    Args:
        driver: WebDriver instance with the page already loaded.
        keyword: Text to send to the search box.
    """
    search_box = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)

    chain = ActionChains(driver)
    chain.move_to_element(search_box)
    chain.send_keys(keyword)
    chain.send_keys(Keys.RETURN)
    chain.perform()

    # Fixed pause after submitting the search.
    time.sleep(3)
def get_crawler_list(db, fixed_keyword='滷味'):
    """Return the keyword to crawl next.

    Previously the function returned a hard-coded keyword unconditionally,
    which left the database lookup below it unreachable. The pinned keyword
    is now a defaulted parameter, so existing ``get_crawler_list(db)`` calls
    behave exactly as before while callers can opt into the lookup.

    Args:
        db: Database handle exposing ``query(sql)`` that yields mapping rows.
            Not touched when ``fixed_keyword`` is set.
        fixed_keyword: Keyword to return without consulting the database.
            Pass ``None`` to select the keyword from ``progress_list2``.

    Returns:
        ``fixed_keyword`` when it is not ``None``; otherwise the ``kw`` of the
        first row returned by the progress query, or ``None`` if it returns
        no rows.
    """
    if fixed_keyword is not None:
        return fixed_keyword

    # NOTE(review): 367 is presumably the number of grid cells to cover per
    # keyword -- confirm against the lat_lon_loc table.
    cursor = db.query('select distinct(kw) from progress_list2 where num < 367 order by num asc limit 1')
    for row in cursor:
        return row['kw']
    return None
def get_lon_lat_list(db, keyword):
    """Return the locations still to be crawled for *keyword*.

    The resume position is the ``num`` of the first ``progress_list2`` row
    for the keyword (0 when there is none); every ``lat_lon_loc`` row with
    ``num`` at or above that position is returned.

    Args:
        db: Database handle exposing ``query(sql, **params)`` with ``:name``
            bind parameters, yielding mapping rows.
        keyword: Search keyword whose progress is looked up.

    Returns:
        A list of dicts with the keys ``num``, ``loc``, ``lat`` and ``lon``.
    """
    num = 0
    # Bind the keyword instead of splicing it into the SQL text: a keyword
    # containing a double quote used to break (or alter) the statement.
    progress = db.query(
        'select num from progress_list2 where kw = :kw', kw=keyword
    )
    for row in progress:
        num = row['num']
        break  # only the first progress row matters

    locations = db.query(
        'select * from lat_lon_loc where num >= :num', num=num
    )
    return [
        {'num': row['num'], 'loc': row['loc'], 'lat': row['lat'], 'lon': row['lon']}
        for row in locations
    ]
def write_to_file(jsobj, fname):
    """Write ``str(jsobj)`` to *fname* as UTF-8, replacing any existing file.

    Args:
        jsobj: Any object; its ``str()`` representation is written.
        fname: Destination file path.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager closes the handle even if the write fails.
    # newline='' keeps line endings untranslated, matching the previous
    # codecs.open() behaviour.
    with open(fname, 'w', encoding='utf-8', newline='') as fw:
        fw.write(str(jsobj))
def parsing_js(orig, debug_path='c:/tmp/debug.txt'):
    """Extract the embedded JSON array from a map-search response and print
    selected fields of each result entry.

    The text is joined into one line with escaped quotes un-escaped, the span
    from the first ``[["`` up to the first ``]]`` that is followed by ``"`` is
    cut out, optionally dumped to *debug_path*, parsed as JSON and walked.

    Args:
        orig: Decoded response body text.
        debug_path: File that receives the extracted JSON text for debugging.
            Pass ``None`` (or an empty string) to skip the dump. The default
            keeps the previous hard-coded location.

    Raises:
        ValueError: If the start or end marker is not found, or (as
            ``json.JSONDecodeError``) if the extracted text is not valid JSON.
        OSError: If the debug dump cannot be written.
    """
    # The un-escape is applied twice on purpose so that doubly escaped quotes
    # (backslash, backslash, quote) are reduced as well.
    content = ''.join(
        line.replace('\\"', '"').replace('\\"', '"')
        for line in orig.split('\n')
    )

    begin_match = re.search(r'\[\["', content)
    print(begin_match)
    if begin_match is None:
        raise ValueError('start marker [[" not found in response text')

    end_match = re.search(r'\]\]"', content)
    print(end_match)
    if end_match is None:
        raise ValueError('end marker ]]" not found in response text')

    # Drop the trailing quote that belongs to the end marker.
    jscontent = content[begin_match.start():end_match.end() - 1]

    if debug_path:
        write_to_file(jscontent, debug_path)

    jsobj = json.loads(jscontent)
    # The first element of the result list is skipped; the meaning of the
    # numeric field positions below is not documented in this file.
    for entry in jsobj[0][1][1:]:
        details = entry[14]
        print(details[11])
        print(details[10])
        print(details[2])
        print(details[78])
def main():
    """Drive the crawler: for one keyword, visit each pending map location,
    run the search and parse the captured map-search responses.

    Reads an optional WebDriver port from ``sys.argv[1]`` (default 4447).
    When a port is given, the docker container named ``p<port>`` is restarted
    first. Progress is recorded per keyword in the ``progress_list2`` table.
    The whole keyword pass is attempted 10 times; a failing pass prints its
    traceback and the next attempt starts.
    """
    global chrome_window

    # One specific host IP switches the script to a local Chrome window.
    localip = socket.gethostbyname(socket.gethostname())
    if localip == '192.168.1.108':
        chrome_window = True

    # NOTE(review): database credentials are hard-coded in the URL; consider
    # moving them to configuration or the environment.
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    # NOTE(review): `table` is never used below; the lookup is kept in case
    # accessing it has a side effect in the dataset library -- confirm.
    table = db['shop_item_list3']
    table2 = db['progress_list2']

    port = 4447
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        # NOTE(review): the restart is assumed to apply only when a port is
        # passed explicitly -- confirm against the deployed script.
        print('restart docker p{}'.format(port))
        os.system('sudo docker container restart p'+str(port))
        time.sleep(8)

    print('drvier start...')
    driver = brower_start(port)

    for i in range(10):
        try:
            keyword = get_crawler_list(db)
            print(keyword)
            lst = get_lon_lat_list(db, keyword)
            print(keyword, len(lst))

            for r in lst:
                latitude = r['lat']   # latitude
                longitude = r['lon']  # longitude
                # Record how far this keyword has progressed.
                table2.upsert({'kw':keyword,'num':r['num']},['kw'])

                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
                driver.get(url)
                keyin_keyword(driver, keyword)
                # Fixed wait for the search requests to complete.
                time.sleep(11)
                print("ppppppppp&**********************")

                # NOTE(review): driver.requests is never cleared here, so
                # responses captured for earlier locations may be parsed
                # again -- confirm whether that is intended.
                for request in driver.requests:
                    if request.response:
                        if 'https://www.google.com.tw/search?tbm=map' in request.url:
                            print('parsing js:')
                            # Assumes the response body is brotli-compressed.
                            resp = brotli.decompress(request.response.body)
                            jstext = resp.decode('utf-8')
                            parsing_js(jstext)

                for page in range(10):
                    if page < 2:
                        element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                        # Stop paging once the button reports itself disabled.
                        if element.get_attribute('disabled'):
                            break
                        driver.implicitly_wait(30)
                        ActionChains(driver).move_to_element(element).click(element).perform()
        except Exception:
            # Keep the retry loop alive, but make the failure visible instead
            # of discarding it; Ctrl-C is no longer swallowed.
            traceback.print_exc()
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|