123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399 |
- # -*- coding: utf-8 -*-
- from seleniumwire import webdriver
- from selenium.webdriver.common.action_chains import ActionChains
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.support.wait import WebDriverWait
- from selenium.webdriver.common.by import By
- import selenium
- import traceback
- from bs4 import BeautifulSoup
- from utility import database_access as DA
- from utility.parseutils import *
- from utility.connect import *
- from datetime import datetime
- import pandas as pd
- import dataset
- import requests, random, time, json
- import re, sys, os
- import socket, brotli
- from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
- import urllib.parse
# Module-level switches shared by the functions below.

# False: drive a remote Chrome through the selenium-wire proxy (see brower_start).
# True: launch a local Chrome window instead, which is handy for debugging.
chrome_window = False

# Search keyword currently being crawled; main() overwrites it from argv / jobs.
globalkw = None

# Port the selenium-wire proxy listens on; main() may override it from argv.
proxyport = 8787
def build_cache(db):
    """Collect every place_id already present in the store-list table.

    Args:
        db: connection object whose ``query(sql)`` yields mapping-like rows.

    Returns:
        dict mapping each stored ``place_id`` to ``1``; callers only use it
        for membership checks via ``.get``.
    """
    sql = 'SELECT place_id FROM {}.{};'.format(MYSQL_CONFIG['MYSQL_DB'], TABLE_STORE_LIST)
    return {row['place_id']: 1 for row in db.query(sql)}
def brower_start(port):
    """Create the browser session used by the crawler.

    When the module flag ``chrome_window`` is true a local Chrome is started;
    otherwise a remote session is opened on the hub at ``127.0.0.1:<port>`` and
    its traffic is routed through the selenium-wire proxy on ``proxyport``.

    Args:
        port: port of the remote WebDriver hub (only used in remote mode).

    Returns:
        The started selenium-wire driver, resized to 1400x1000.
    """
    global proxyport
    global chrome_window
    print(proxyport)

    if chrome_window:
        local_options = webdriver.ChromeOptions()
        browser = webdriver.Chrome(
            desired_capabilities=local_options.to_capabilities()
        )
    else:
        remote_options = webdriver.ChromeOptions()
        chrome_flags = (
            # Browser inside the container reaches the proxy on the host.
            '--proxy-server=host.docker.internal:' + str(proxyport),
            '--ignore-certificate-errors',
            "--no-sandbox",
            "--disable-dev-shm-usage",
        )
        for flag in chrome_flags:
            remote_options.add_argument(flag)

        hub_url = 'http://127.0.0.1:' + str(port) + '/wd/hub'
        wire_settings = {'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False}
        browser = webdriver.Remote(
            command_executor=hub_url,
            desired_capabilities=remote_options.to_capabilities(),
            seleniumwire_options=wire_settings,
        )

    # NOTE(review): nesting was reconstructed from a flattened source; the
    # resize is assumed to apply to both modes - confirm.
    browser.set_window_size(1400, 1000)
    return browser
def keyin_keyword(driver, keyword):
    """Type *keyword* into the page's search box and submit it.

    Args:
        driver: Selenium WebDriver currently showing the map page.
        keyword: text to search for.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if no element with
            id ``searchbox`` appears within the implicit wait.
    """
    # Configure the implicit wait *before* the lookup so it also covers the
    # very first search on a freshly created driver.
    driver.implicitly_wait(30)
    # The find_element_by_* helpers were removed in Selenium 4.3; the By
    # locator form works on both Selenium 3 and 4.
    search_box = driver.find_element(By.ID, "searchbox")
    ActionChains(driver).move_to_element(search_box).send_keys(keyword).send_keys(Keys.RETURN).perform()
    # Give the results (and the background search request) time to load.
    time.sleep(3)
def scan_job(db, kw):
    """Pick a random area whose latest log row shows growth and return its job.

    The conversion-log table is joined with its own newest row per ``num``
    (``max(id)``); only rows where ``next - prev`` is positive are candidates.

    Args:
        db: connection object whose ``query(sql)`` yields mapping-like rows.
        kw: keyword to store under ``'kw'`` in the returned job.

    Returns:
        dict with ``kw`` and ``num`` plus ``lat``/``lon``/``loc`` when the
        coordinate table has a row for that ``num``.

    Raises:
        KeyError: if the log query yields no candidate (``num`` is never set).
    """
    job = {'kw': kw}
    log_table = '{}.{}'.format(MYSQL_CONFIG['MYSQL_DB'], MYSQL_CONFIG['TABLE_CONV_LOG'])
    pick_sql = (
        'select t1.num,next-prev as diff from {} t1, '
        '(SELECT num,max(id) mid FROM {} group by num ) t2 '
        'where t1.id=t2.mid having diff>0 order by rand()'
    ).format(log_table, log_table)

    # Only the first (random) candidate is used.
    first_hit = next(iter(db.query(pick_sql)), None)
    if first_hit is not None:
        job['num'] = first_hit['num']

    coord_sql = 'select lat,lon,loc from {} where num ="'.format(TABLE_LAT_LON) + str(job['num']) + '"'
    # No early exit here: if several rows match, the last one wins.
    for row in db.query(coord_sql):
        job.update(lat=row['lat'], lon=row['lon'], loc=row['loc'])
    return job
def get_next_job(db, repeat=False, repkw=None, repnum=None):
    """Build the next crawl job as a dict.

    Args:
        db: connection object whose ``query(sql)`` yields mapping-like rows.
        repeat: when true, take keyword/coordinates from an already stored
            place instead of relying only on the area-code queue.
        repkw: keyword override; a value containing ``'REP'`` switches to
            re-crawling a stored place.
        repnum: area number override; ``'REP'`` or ``None`` means "pick a
            random one from the store list".

    Returns:
        dict that may contain ``kw``, ``num``, ``lat``, ``lon`` and ``loc``;
        keys are only present when the corresponding query returned a row.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation had been flattened - confirm against the original file.
    """
    global globalkw
    result={}
    # Take one random area/keyword pair that has not been expanded yet.
    cursor = db.query('select kw, num from {} where expand = 0 order by rand()'.format(TABLE_AREACODES))
    for c in cursor:
        if repkw is None:
            repkw = c['kw']
        result['kw'] = c['kw']
        result['num'] = c['num']
        break
    # An explicit keyword from the caller wins over the queued one.
    if repkw is not None:
        result['kw'] = repkw
    # Resolve the area number to its coordinates.
    if result.get('num') is not None:
        cursor = db.query('select lat,lon,loc from {} where num ="{}"'.format(TABLE_LAT_LON, str(result['num'])))
        for c in cursor:
            result['lat']=c['lat']
            result['lon']=c['lon']
            result['loc']=c['loc']
            break
    # Repeat mode with a concrete keyword: force the caller's keyword and area.
    if repeat and repkw!= 'REP':
        result['kw']=repkw
        result['num']=repnum
    # NOTE(review): if repkw is still None here (no queued row and no override),
    # the membership test below raises TypeError.
    if 'REP' in repkw:
        if repnum=='REP':
            repnum=None
            cursor = db.query('select num from {} order by rand() limit 1'.format(TABLE_STORE_LIST))
            for c in cursor:
                repnum=c['num']
                break
        # Same random pick again when no area number is available yet.
        if repnum is None:
            cursor = db.query('select num from {} order by rand() limit 1'.format(TABLE_STORE_LIST))
            for c in cursor:
                repnum=c['num']
                break
        # Re-use keyword and coordinates of one stored place in that area.
        cursor = db.query('select lat_txt,lon_txt,keyword,num from {} where num="{}" limit 1'.format(TABLE_STORE_LIST, str(repnum)))
        for c in cursor:
            result['kw']=c['keyword']
            result['num']=c['num']
            result['lat']=c['lat_txt']
            result['lon']=c['lon_txt']
            result['loc']=''
            return result
    # Repeat mode fallback: keyword and coordinates of any random stored place.
    if repeat:
        cursor = db.query('select lat_txt,lon_txt,keyword from {} order by rand() limit 1'.format(TABLE_STORE_LIST))
        for c in cursor:
            result['kw']=c['keyword']
            result['lat']=c['lat_txt']
            result['lon']=c['lon_txt']
    return result
def write_to_file(jsobj, fname):
    """Write ``str(jsobj)`` to *fname* as UTF-8, replacing any existing file.

    Args:
        jsobj: any object; its ``str()`` form is what gets written.
        fname: path of the file to create or overwrite.

    Raises:
        OSError: if the file cannot be opened or written.
    """
    # ``with`` closes the handle even if the write fails; newline='' keeps
    # line endings untouched, as the previous codecs-based writer did.
    with open(fname, 'w', encoding='utf-8', newline='') as fw:
        fw.write(str(jsobj))
def _dig(node, *path):
    """Follow *path* through nested lists/dicts; return None if a step is missing."""
    for key in path:
        try:
            node = node[key]
        except (IndexError, KeyError, TypeError):
            return None
    return node


def parsing_js(orig):
    """Extract place records from the text of a map search response.

    The text is expected to embed an escaped JSON array that starts with
    ``[["`` and ends with ``]]`` followed by a quote. Each entry after the
    first one in ``payload[0][1]`` carries the place data at index 14; the
    positional indices used below are the ones this crawler relies on.

    Args:
        orig: decoded response body.

    Returns:
        list of dicts, one per place whose mandatory fields (name, fid,
        address, place_id, coordinates) could be read. Optional fields are
        ``None`` when absent. An empty list is returned when the text holds
        no payload.

    Raises:
        json.JSONDecodeError: if the delimited span is not valid JSON.
    """
    # Un-escape quotes line by line (applied twice, as the crawler always
    # did) and glue the lines back together.
    content = ''.join(
        line.replace('\\"', '"').replace('\\"', '"') for line in orig.split('\n')
    )

    begin = content.find('[["')
    end = content.find(']]"', begin) if begin != -1 else -1
    if end == -1:
        # Not every matching request carries a result payload.
        print('parsing_js: no result payload found')
        return []

    # Keep the closing brackets but drop the quote that follows them.
    jsobj = json.loads(content[begin:end + 2])

    resultobj = []
    for x in jsobj[0][1][1:]:
        info = _dig(x, 14)
        if info is None:
            continue

        # Distinguish "index missing" (None) from "present but null" ('None').
        try:
            addr_elmts = str(info[82])
        except (IndexError, KeyError, TypeError):
            addr_elmts = None

        try:
            record = {
                'name': info[11],
                'fid': info[10],
                'addr': info[2][0],
                'addr_elmts': addr_elmts,
                'place_id': info[78],
                'category': str(_dig(info, 13)),
                'rating': _dig(info, 4, 7),
                'reviews_cnt': _dig(info, 4, 8),
                'lat': info[9][2],
                'lat_txt': str(info[9][2]),
                'lon': info[9][3],
                'lon_txt': str(info[9][3]),
                'topic': str(_dig(info, 89)),
                'photo': _dig(info, 37, 0, 0, 0),
                # Looked up per entry so a value from a previous place can
                # never leak into this one.
                'num_photos': _dig(info, 37, 0, 0, 6, 1),
                'loc_x': _dig(info, 37, 0, 0, 29, 0),
                'loc_y': _dig(info, 37, 0, 0, 29, 1),
                'biz_id': _dig(info, 57, 2),
                # Independent of biz_id: a missing id no longer hides the phone.
                'tel': _dig(info, 178, 0, 3),
                'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M"),
            }
        except (IndexError, KeyError, TypeError):
            # A mandatory field is missing: report it and skip this entry.
            traceback.print_exc()
            continue

        print(record['name'], record['place_id'])
        resultobj.append(record)
    return resultobj
def save_js_to_db(jsobj, num, keyword):
    """Insert parsed place records that are not stored yet.

    Reads the module globals ``iddict`` (known place ids) and
    ``store_list_table`` (target table). Each record is tagged in place with
    the area number and keyword before it is inserted.

    Args:
        jsobj: iterable of record dicts, each with a ``'place_id'`` key.
        num: area number stored under ``'num'``.
        keyword: search keyword stored under ``'keyword'``.
    """
    global store_list_table
    global iddict
    for r in jsobj:
        place_id = r['place_id']
        if iddict.get(place_id) is not None:
            continue
        r['num'] = num
        r['keyword'] = keyword
        try:
            store_list_table.insert(r)
        except Exception:
            # Best effort: one bad row must not stop the rest of the batch.
            traceback.print_exc()
        else:
            # Remember the id so the same place arriving in another response
            # during this pass is not inserted a second time.
            if place_id is not None:
                iddict[place_id] = 1
def process_web_request(db, driver, area_num, keyword):
    """Parse captured search responses, store the places and log the growth.

    For every captured request whose URL contains ``search?`` and that has a
    response, the body is Brotli-decompressed, parsed with ``parsing_js`` and
    saved; afterwards the row count for the area is written to the
    conversion log together with the module global ``prev_cnt``. The captured
    requests are cleared at the end.

    Args:
        db: connection object (``query`` and table access by name).
        driver: selenium-wire driver exposing ``requests``.
        area_num: area number the results belong to.
        keyword: search keyword the results belong to.

    NOTE(review): nesting was reconstructed from a flattened source; the log
    insert is assumed to run once per parsed response - confirm.
    """
    global prev_cnt
    # Let outstanding network traffic settle before reading the capture.
    time.sleep(0.8)
    time.sleep(3)
    print("ppppppppp&**********************")

    for req in driver.requests:
        is_search = 'search?' in req.url
        if is_search:
            print('searching.....')
        if not (req.response and is_search):
            continue

        print('parsing js:')
        print(req.url)
        # The body is assumed to be Brotli-compressed UTF-8 text.
        raw = brotli.decompress(req.response.body)
        places = parsing_js(raw.decode('utf-8'))

        print("before",datetime.now())
        print("num: "+str(area_num))
        save_js_to_db(places, area_num, keyword)
        print("after",datetime.now())

        count_sql = 'select count(*) as cnt from {} where num="{}" '.format(TABLE_STORE_LIST, str(area_num))
        first_row = next(iter(db.query(count_sql)), None)
        aft_cnt = first_row['cnt'] if first_row is not None else 0
        db[TABLE_CONV_LOG].insert(
            {'num': area_num, 'prev': prev_cnt, 'next': aft_cnt, 'dt': datetime.now()}
        )

    # Drop the capture so the next search starts from an empty list.
    del driver.requests
def check_area_code(db, kw):
    """Make sure the area-code queue contains rows for keyword *kw*.

    If *kw* is not yet present in the area-code table, one row per area in
    the coordinate table is inserted with ``expand = 0``. Nothing happens
    for an empty keyword.

    Args:
        db: connection object whose ``query`` accepts bind parameters.
        kw: search keyword (may be ``None``).
    """
    if not kw:
        return

    table_name = '{}.{}'.format(MYSQL_CONFIG['MYSQL_DB'], TABLE_AREACODES)
    known = {row['kw'] for row in db.query('select distinct(kw) from {}'.format(table_name))}
    if kw in known:
        return

    # The keyword comes from the command line, so it is passed as a bound
    # parameter rather than spliced into the statement; a quote in the
    # keyword can no longer break or alter the SQL.
    sql = 'insert into {} (select num, :kw as kw, 0 as expand from {}) '.format(
        table_name, TABLE_LAT_LON
    )
    try:
        db.query(sql, kw=kw)
    except Exception:
        # Best effort, as before: report the failure and keep going.
        traceback.print_exc()
def page_down_(driver, time_):
    """Scroll the result list by pressing END *time_* times.

    The element matching ``div[class="TFQHme"]`` is clicked first, then the
    browser navigates back before the key presses are sent. Any failure is
    printed and swallowed; scrolling is best effort.

    Args:
        driver: Selenium WebDriver showing the search results.
        time_: number of END key presses to send.
    """
    try:
        # find_element_by_css_selector was removed in Selenium 4.3; the By
        # locator form works on both Selenium 3 and 4.
        anchor = driver.find_element(By.CSS_SELECTOR, 'div[class="TFQHme"]')
        ActionChains(driver).move_to_element(anchor).click().perform()
        time.sleep(1)
        driver.back()
        time.sleep(1)
        for i in range(time_):
            print(i)
            ActionChains(driver).send_keys(Keys.END).perform()
            # Pause between presses so newly loaded results can render.
            time.sleep(0.5)
    except Exception:
        traceback.print_exc()
def main():
    """Run the crawler: fetch jobs, search each area and store the results.

    Command line (all optional): ``<keyword> <hub port> <proxy port>``.
    Without arguments the module defaults are used (hub port 4447).

    For up to 368 jobs the map is opened at the job's coordinates and at
    four randomly shifted positions around them, the keyword is searched and
    the captured responses are parsed and saved. A finished job is recorded
    in the progress table and marked ``expand = 1`` in the area-code table.

    NOTE(review): nesting was reconstructed from a flattened source; the
    progress insert and the ``expand`` update are assumed to run once per
    job, after its five searches - confirm.
    """
    global chrome_window
    global store_list_table
    global globalkw
    global proxyport
    global iddict
    global prev_cnt

    port = 4447
    argv = sys.argv
    if len(argv) > 1:
        globalkw = argv[1]
        # Ports are optional; passing only a keyword used to raise IndexError.
        if len(argv) > 2:
            port = int(argv[2])
        if len(argv) > 3:
            proxyport = int(argv[3])
        print(globalkw, port, proxyport)

    db = dataset.connect('mysql://{}:{}@{}/{}?charset=utf8mb4'.format(
        MYSQL_CONFIG['MYSQL_USER'],
        MYSQL_CONFIG['MYSQL_PASSWORD'],
        MYSQL_CONFIG['MYSQL_HOST'],
        MYSQL_CONFIG['MYSQL_DB'],
    ))
    store_list_table = db[TABLE_STORE_LIST]
    table2 = db[TABLE_PROGRESS_LIST]

    if not chrome_window:
        # Start from a fresh browser container and give it time to come up.
        print('restart docker pw{}'.format(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(10)

    print('drvier start...')
    driver = brower_start(port)
    try:
        check_area_code(db, globalkw)
        for i in range(368):
            job = get_next_job(db, repkw=globalkw)
            print(job)
            # Stop cleanly instead of failing with KeyError when the queue is
            # exhausted or the area has no coordinates.
            if any(field not in job for field in ('kw', 'lat', 'lon', 'num')):
                print('no runnable job left, stopping')
                break

            keyword = job['kw']
            globalkw = keyword
            latitude = job['lat']
            longitude = job['lon']
            area_num = job['num']

            for j in range(5):
                # Refresh the known-place cache before every search.
                iddict = build_cache(db)
                if j != 0:
                    # Later passes shift the map centre by up to +/-0.0999
                    # degrees to reach places outside the first viewport.
                    latitude_ = float(latitude) + (random.randint(-999, 999) / 10000)
                    longitude_ = float(longitude) + (random.randint(-999, 999) / 10000)
                else:
                    latitude_, longitude_ = latitude, longitude
                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude_, longitude_)
                print(url)

                # Row count before the search; process_web_request logs it
                # next to the count after the search.
                prev_cnt = 0
                cursor = db.query('select count(*) as cnt from {} where num="{}" '.format(TABLE_STORE_LIST, str(area_num)))
                for c in cursor:
                    prev_cnt = c['cnt']
                    break

                driver.get(url)
                time.sleep(2)
                keyin_keyword(driver, keyword)
                process_web_request(db, driver, area_num, keyword)
                time.sleep(1)

            # NOTE(review): the second positional argument of insert() is
            # passed as ['kw']; confirm that an upsert keyed on 'kw' was not
            # what was intended here.
            table2.insert({'kw': keyword, 'num': job['num']}, ['kw'])
            # Bound parameters: a quote in the keyword cannot break the update.
            db.query(
                'update {} set expand = 1 where num = :num and kw = :kw'.format(TABLE_AREACODES),
                num=str(job['num']),
                kw=keyword,
            )
    finally:
        # Always release the browser session, even when a job fails.
        driver.quit()
# Start the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|