@@ -0,0 +1,393 @@
+# -*- coding: utf-8 -*-
+from seleniumwire import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+import selenium
+import traceback
+from bs4 import BeautifulSoup
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *
+from datetime import datetime
+import pandas as pd
+import dataset
+import requests
+import time
+import json
+import re
+import sys, os
+import socket
+import brotli
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import urllib.parse
def build_cache(db):
    """Load every stored ``place_id`` into a lookup dict.

    Args:
        db: Database handle whose ``query(sql)`` yields mapping-like rows.

    Returns:
        dict: Each ``place_id`` found in the store-list table mapped to ``1``.
    """
    sql = 'SELECT place_id FROM {}.{};'.format(MYSQL_CONFIG['MYSQL_DB'], TABLE_STORE_LIST)
    return {row['place_id']: 1 for row in db.query(sql)}
+def brower_start(port):  # Create and return a selenium-wire Chrome driver (local or remote); `port` is only used to build the remote hub address.
+ global proxyport  # read here; assigned in main() from sys.argv[3]
+ global chrome_window  # NOTE(review): never assigned in the visible code - presumably supplied by a star-import; confirm
+ print(proxyport)
+ options = webdriver.ChromeOptions()
+ if chrome_window:  # truthy -> start a local Chrome with default options
+ browser = webdriver.Chrome(
+ desired_capabilities=options.to_capabilities()
+ )
+ else:  # otherwise attach to a remote Selenium hub
+ chrome_options = webdriver.ChromeOptions()
+ chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport)) # Send browser traffic to the proxy on host.docker.internal (original note: specify your Kubernetes service-name here)
+ chrome_options.add_argument('--ignore-certificate-errors')
+ chrome_options.add_argument("--no-sandbox")
+ chrome_options.add_argument("--disable-dev-shm-usage")
+ browser = webdriver.Remote(
+ command_executor=''+str(port)+'/wd/hub',  # NOTE(review): the host part is an empty string here, so this is not a complete URL - confirm the intended hub address
+ desired_capabilities=chrome_options.to_capabilities(),
+ seleniumwire_options={'addr':'','port':proxyport,'auto_config': False}  # selenium-wire proxy settings; 'addr' is empty in this view - TODO confirm
+ )
+# seleniumwire_options = {'addr': '','port':4444})
+ browser.set_window_size(1400,1000)  # NOTE(review): indentation is lost in this view; presumably applies to both branches - confirm
+ return browser
def keyin_keyword(driver, keyword):
    """Type ``keyword`` into the ``#searchbox`` element and submit it with RETURN.

    Args:
        driver: Selenium WebDriver showing a page that contains ``#searchbox``.
        keyword: Text to enter.
    """
    # Configure the implicit wait *before* the lookup so the search box gets up
    # to 30 s to appear; the original set it afterwards, too late to help here.
    driver.implicitly_wait(30)
    search_box = driver.find_element_by_id("searchbox")
    ActionChains(driver).move_to_element(search_box).send_keys(keyword).send_keys(Keys.RETURN).perform()
    # Fixed pause before returning (presumably to let the results start loading).
    time.sleep(3)
def scan_job(db, kw):
    """Pick a random area whose latest conversion-log row shows growth.

    The first query takes, per ``num``, the log row with the highest ``id`` and
    keeps those where ``next - prev > 0``; one of them is chosen at random.
    The coordinates for that ``num`` are then read from the lat/lon table.

    Args:
        db: Database handle whose ``query(sql)`` yields mapping-like rows.
        kw: Value stored unchanged under ``'kw'`` in the result.

    Returns:
        dict: ``{'kw', 'num'}`` plus ``'lat'``, ``'lon'``, ``'loc'`` when the
        lat/lon table has a row for the chosen ``num``.

    Raises:
        KeyError: If no area qualifies.  The original failed with a bare
            ``KeyError: 'num'``; the exception type is kept, the message now
            says why.
    """
    result = {'kw': kw}
    # NOTE(review): other code in this file uses a TABLE_CONV_LOG constant
    # rather than a MYSQL_CONFIG entry - confirm this key exists.
    table_name = '{}.{}'.format(MYSQL_CONFIG['MYSQL_DB'], MYSQL_CONFIG['TABLE_CONV_LOG'])
    candidates = db.query(
        'select t1.num,next-prev as diff from {0} t1, '
        '(SELECT num,max(id) mid FROM {0} group by num ) t2 '
        'where t1.id=t2.mid having diff>0 order by rand()'.format(table_name)
    )
    # Rows arrive in random order, so the first one is the random pick.
    chosen = next(iter(candidates), None)
    if chosen is None:
        raise KeyError('num: no area in {} has next > prev'.format(table_name))
    result['num'] = chosen['num']

    coords = db.query(
        'select lat,lon,loc from {} where num ="{}"'.format(TABLE_LAT_LON, str(result['num']))
    )
    for row in coords:
        result['lat'] = row['lat']
        result['lon'] = row['lon']
        result['loc'] = row['loc']
    return result
+def get_next_job(db, repeat=False, repkw=None, repnum=None):  # Build the next crawl job dict (kw / num / lat / lon / loc) from the area-code, lat/lon and store-list tables. NOTE(review): block nesting is not recoverable from this view; comments below describe single statements only.
+ global globalkw  # declared but not referenced in this function
+ result={}
+ cursor = db.query('select kw, num from {} where expand = 0 order by rand()'.format(TABLE_AREACODES))  # area/keyword pairs not yet expanded, in random order
+ for c in cursor:
+ if repkw is None:  # no keyword supplied by the caller -> adopt the one from the table
+ repkw = c['kw']
+ result['kw'] = c['kw']
+ result['num'] = c['num']
+ break  # only the first (random) row is used
+ if repkw is not None:  # a known keyword overrides the one read from the table
+ result['kw'] = repkw
+ if result.get('num') is not None:  # look up coordinates only when an area number was found
+ cursor = db.query('select lat,lon,loc from {} where num ="{}"'.format(TABLE_LAT_LON, str(result['num'])))
+ for c in cursor:
+ result['lat']=c['lat']
+ result['lon']=c['lon']
+ result['loc']=c['loc']
+ break
+ if repeat and repkw!= 'REP':  # repeat mode with an explicit keyword: use the caller's kw/num as given
+ result['kw']=repkw
+ result['num']=repnum
+ if 'REP' in repkw:  # NOTE(review): raises TypeError if repkw is still None at this point - confirm callers always end up with a keyword
+ if repnum=='REP':  # 'REP' as the number means "no specific area"
+ repnum=None
+ cursor = db.query('select num from {} order by rand() limit 1'.format(TABLE_STORE_LIST))  # random area number that already has stored places
+ for c in cursor:
+ repnum=c['num']
+ break
+ if repnum is None:  # same random pick when no number is available
+ cursor = db.query('select num from {} order by rand() limit 1'.format(TABLE_STORE_LIST))
+ for c in cursor:
+ repnum=c['num']
+ break
+ cursor = db.query('select lat_txt,lon_txt,keyword,num from {} where num="{}" limit 1'.format(TABLE_STORE_LIST, str(repnum)))  # reuse keyword and coordinates of one stored place in that area
+ for c in cursor:
+ result['kw']=c['keyword']
+ result['num']=c['num']
+ result['lat']=c['lat_txt']
+ result['lon']=c['lon_txt']
+ result['loc']=''
+ return result
+ if repeat:  # repeat mode: take keyword and coordinates from a random stored place
+ cursor = db.query('select lat_txt,lon_txt,keyword from {} order by rand() limit 1'.format(TABLE_STORE_LIST))
+ for c in cursor:
+ result['kw']=c['keyword']
+ result['lat']=c['lat_txt']
+ result['lon']=c['lon_txt']
+ return result
def write_to_file(jsobj, fname):
    """Write ``str(jsobj)`` to ``fname`` as UTF-8, replacing any existing file.

    Args:
        jsobj: Any object; its ``str()`` form is what gets written.
        fname: Destination path.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # ``with`` closes the handle even if the write fails (the original leaked
    # it).  newline="" disables newline translation, matching the binary-mode
    # behaviour of the codecs.open() call this replaces.
    with open(fname, "w", encoding="utf-8", newline="") as out:
        out.write(str(jsobj))
def parsing_js(orig):
    """Extract store records from the text of a map ``search?`` response.

    The text is expected to contain a JSON array that starts at the first
    ``[["`` and ends at the following ``]]"``, with its double quotes
    backslash-escaped.  Every entry after the first in ``payload[0][1]``
    becomes one record built from fixed positions of ``entry[14]``.

    Args:
        orig: Decoded response body.

    Returns:
        list[dict]: One dict per readable entry with the keys ``name``,
        ``fid``, ``addr``, ``addr_elmts``, ``place_id``, ``category``,
        ``rating``, ``reviews_cnt``, ``lat``, ``lat_txt``, ``lon``,
        ``lon_txt``, ``topic``, ``photo``, ``num_photos``, ``loc_x``,
        ``loc_y``, ``biz_id``, ``tel`` and ``crawler_date``.  Optional fields
        are ``None`` when the response does not carry them.

    Raises:
        ValueError: If either marker is missing (the original failed with an
            ``AttributeError`` on ``None``) or the extracted text is not
            valid JSON.
        IndexError, TypeError: If an entry lacks one of the positions that are
            read outside the guarded sections (name, coordinates, category,
            topic, fid, address, place id).
    """
    # Lookups into the nested lists fail with one of these when data is absent.
    missing = (IndexError, KeyError, TypeError)

    # Quotes may be escaped twice, hence two passes per line; join() avoids
    # the quadratic cost of repeated string concatenation.
    content = "".join(
        line.replace('\\"', '"').replace('\\"', '"') for line in orig.split('\n')
    )

    begin = re.search(r'\[\["', content)
    print(begin)
    if begin is None:
        raise ValueError('start marker [[" not found in response text')
    # Look for the end marker only after the start marker so an earlier
    # stray ']]"' cannot produce an empty or inverted slice.
    end = re.compile(r'\]\]"').search(content, begin.start())
    print(end)
    if end is None:
        raise ValueError('end marker ]]" not found in response text')

    # Drop the trailing quote that belongs to the end marker.
    jsobj = json.loads(content[begin.start():end.end() - 1])

    resultobj = []
    for x in jsobj[0][1][1:]:
        info = x[14]
        print(info[11])
        print(info[9])

        # Reset every optional field per entry.  The original never reset
        # num_photos, so it was either unbound (dropping the entry) or carried
        # over from the previous store.
        rating = reviews_cnt = None
        photo = num_photos = None
        loc_x = loc_y = None
        biz_id = tel = None
        addr_elmts = None

        try:
            rating = info[4][7]
            reviews_cnt = info[4][8]
        except missing:
            traceback.print_exc()
        try:
            photo = info[37][0][0][0]
            num_photos = info[37][0][0][6][1]
        except missing:
            traceback.print_exc()
        try:
            loc_x = info[37][0][0][29][0]
            loc_y = info[37][0][0][29][1]
        except missing:
            traceback.print_exc()
        # biz_id and tel live at unrelated positions; read them separately so
        # a missing biz_id no longer discards an available phone number.
        try:
            biz_id = info[57][2]
        except missing:
            traceback.print_exc()
        try:
            tel = info[178][0][3]
        except missing:
            traceback.print_exc()
        try:
            addr_elmts = str(info[82])
        except missing:
            traceback.print_exc()

        category = str(info[13])
        topic = str(info[89])
        print(info[13])
        print(info[10])
        print(info[2])
        print(info[78])

        try:
            resultobj.append({
                'name': info[11],
                'fid': info[10],
                'addr': info[2][0],
                'addr_elmts': addr_elmts,
                'place_id': info[78],
                'category': category,
                'rating': rating,
                'reviews_cnt': reviews_cnt,
                'lat': info[9][2],
                'lat_txt': str(info[9][2]),
                'lon': info[9][3],
                'lon_txt': str(info[9][3]),
                'topic': topic,
                'photo': photo,
                'num_photos': num_photos,
                'loc_x': loc_x,
                'loc_y': loc_y,
                'biz_id': biz_id,
                'tel': tel,
                'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M"),
            })
        except missing:
            # A required field is absent: log it and skip this entry.
            traceback.print_exc()
    return resultobj
def save_js_to_db(jsobj, num, keyword):
    """Insert parsed store records that are not already known.

    Each record is tagged in place with ``num`` and ``keyword`` before being
    inserted into the module-level ``store_list_table``.  Records whose
    ``place_id`` is present in the module-level ``iddict`` cache are skipped.

    Args:
        jsobj: Iterable of record dicts, each with a ``'place_id'`` key.
        num: Area number stored on every inserted record.
        keyword: Search keyword stored on every inserted record.
    """
    global store_list_table
    global iddict
    for record in jsobj:
        place_id = record['place_id']
        if iddict.get(place_id) is not None:
            continue
        record['num'] = num
        record['keyword'] = keyword
        try:
            store_list_table.insert(record)
        except Exception:
            # Best effort: log the failure and carry on with the next record.
            traceback.print_exc()
        else:
            # Remember the id so the same place is not inserted again during
            # this run; the original never refreshed the cache after loading.
            if place_id is not None:
                iddict[place_id] = 1
+def process_web_request(db, driver, area_num, keyword):  # Parse captured 'search?' responses, store the places found and log the stored-row count for this area. NOTE(review): block nesting is not recoverable from this view.
+ global prev_cnt  # row count for this area, taken in main() before the search
+ request_url = None  # not used afterwards in this function
+ time.sleep(0.8)
+ time.sleep(3)
+ print("ppppppppp&**********************")
+ for request in driver.requests:  # requests captured by selenium-wire
+ if 'search?' in request.url :
+ print('searching.....')
+ if request.response:  # skip requests that have no response
+ if 'search?' in request.url :
+ print('parsing js:')
+ print(request.url)
+ resp = brotli.decompress(request.response.body)  # assumes the body is Brotli-compressed - TODO confirm the response encoding is always br
+ jstext = resp.decode('utf-8')
+ resultobj = parsing_js(jstext)
+ print("before",datetime.now())
+ print("num: "+str(area_num))
+ save_js_to_db(resultobj, area_num, keyword)
+ print("after",datetime.now())
+ aft_cnt=0
+ cursor = db.query('select count(*) as cnt from {} where num="{}" '.format(TABLE_STORE_LIST, str(area_num)))  # stored rows for this area after saving
+ for c in cursor:
+ aft_cnt=c['cnt']
+ break
+ db[TABLE_CONV_LOG].insert({'num':area_num,'prev':prev_cnt,'next':aft_cnt,'dt':datetime.now()})  # log the before/after counts for this area
+ del driver.requests  # presumably clears selenium-wire's captured requests - confirm
def check_area_code(db, kw):
    """Seed the area-code table with one row per area for a new keyword.

    If ``kw`` is not yet among the distinct ``kw`` values of the area-code
    table, a row ``(num, kw, expand=0)`` is inserted for every ``num`` in the
    lat/lon table.  Insert failures are logged, not raised.

    Args:
        db: Database handle whose ``query(sql, **params)`` runs SQL with
            ``:name`` bind parameters.
        kw: Keyword to register.
    """
    table_name = '{}.{}'.format(MYSQL_CONFIG['MYSQL_DB'], TABLE_AREACODES)
    known = {row['kw'] for row in db.query('select distinct(kw) from {}'.format(table_name))}
    if kw in known:
        return
    # Pass the keyword as a bind parameter instead of splicing it into the
    # SQL text, so quotes in the keyword can neither break nor alter the query.
    sql = 'insert into {} (select num, :kw as kw,0 as expand from {}) '.format(table_name, TABLE_LAT_LON)
    try:
        db.query(sql, kw=kw)
    except Exception:
        traceback.print_exc()
def page_down_(driver, time_):
    """Click the ``div[class="TFQHme"]`` element, go back, then press END repeatedly.

    Any failure is logged with a traceback and the function returns normally.

    Args:
        driver: Selenium WebDriver showing the search results.
        time_: Number of END key presses to send, 0.5 s apart.
    """
    try:
        target = driver.find_element_by_css_selector('div[class="TFQHme"]')
        ActionChains(driver).move_to_element(target).click().perform()
        time.sleep(1)
        driver.back()
        time.sleep(1)
        for i in range(time_):
            print(i)
            # Each END press presumably scrolls the result list further.
            ActionChains(driver).send_keys(Keys.END).perform()
            time.sleep(0.5)
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit; those now propagate so the run can be interrupted.
        traceback.print_exc()
+def main():  # Script entry point: read CLI arguments, start the browser, run one keyword search for one area and record the outcome.
+ global chrome_window
+ global store_list_table
+ global globalkw
+ global proxyport
+ global iddict
+ global prev_cnt
+ port=4447  # default Selenium port when no CLI arguments are given
+ if len(sys.argv)>1:  # NOTE(review): argv[2] and argv[3] are read unconditionally below, so fewer than three arguments raises IndexError
+ globalkw=sys.argv[1]
+ port=int(sys.argv[2])
+ proxyport=int(sys.argv[3])
+ print(globalkw, port, proxyport)  # NOTE(review): globalkw / proxyport are only assigned inside the branch above - confirm they are defined elsewhere when no arguments are passed
+ failcnt=0  # not used afterwards in the visible code
+ localip=socket.gethostbyname(socket.gethostname())  # not used afterwards in the visible code
+ db = dataset.connect('mysql://{}:{}@{}/{}?charset=utf8mb4'.format( MYSQL_CONFIG['MYSQL_USER'],  # NOTE(review): this statement is cut off in the visible source (remaining format() arguments and closing parentheses are missing)
+ iddict = build_cache(db)  # place_ids already stored, used to skip duplicates
+ store_list_table = db[TABLE_STORE_LIST]
+ table2 = db[TABLE_PROGRESS_LIST]
+ if not chrome_window:  # remote mode: restart the browser container before connecting
+ print('restart docker pw{}'.format(port))
+ os.system('sudo docker container restart pw'+str(port))
+ # os.system('docker container restart p'+str(port))
+ time.sleep(10)
+ print('drvier start...')
+ driver = brower_start(port)
+ # check_area_code(db, globalkw)
+ area_num=None
+ if len(sys.argv) > 4 :  # a 4th argument switches to repeat / scan mode
+ repkw = sys.argv[1]
+ repnum = sys.argv[2]  # NOTE(review): argv[2] was also parsed as the port above - confirm this double use is intended
+ if 'SCAN' in repkw:
+ job = scan_job(db, repnum)
+ else:
+ job = get_next_job(db, repeat=True, repkw=repkw, repnum=repnum)
+ else:
+ job = get_next_job(db, repkw=globalkw)
+ print(job)
+ keyword = job['kw']
+ latitude = job['lat'] # latitude
+ longitude = job['lon'] # longitude
+ area_num = job['num']
+ safe_string = urllib.parse.quote_plus(keyword)  # not used afterwards in the visible code
+ url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
+ print(url)
+ prev_cnt=0
+ cursor = db.query('select count(*) as cnt from {} where num="{}" '.format(TABLE_STORE_LIST, str(area_num)))  # stored rows for this area before the search
+ for c in cursor:
+ prev_cnt = c['cnt']
+ break
+ driver.get(url)
+ time.sleep(2)
+ keyin_keyword(driver, keyword)
+ page_down_(driver, 10)
+ process_web_request(db, driver, area_num, keyword)
+ table2.insert({'kw':keyword,'num':job['num']},['kw'])  # NOTE(review): the second positional argument of dataset's insert() is `ensure`, not a key list - was upsert() intended?
+ db.query('update {} set expand = 1 where num="'.format(TABLE_AREACODES)+str(job['num'])+'" and kw="'+keyword+'" ')  # mark this area/keyword as expanded; NOTE(review): keyword is concatenated into the SQL unescaped
+if __name__ == '__main__':  # run the crawler only when executed as a script, not on import
+ main()