swire_docker_itemlist.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441
  1. # -*- coding: utf-8 -*-
  2. #from selenium import webdriver
  3. from seleniumwire import webdriver
  4. from selenium.webdriver.common.action_chains import ActionChains
  5. from selenium.webdriver.common.keys import Keys
  6. from selenium.webdriver.support import expected_conditions as EC
  7. from selenium.webdriver.support.wait import WebDriverWait
  8. from selenium.webdriver.common.by import By
  9. import selenium
  10. import traceback
  11. from bs4 import BeautifulSoup
  12. from utility import database_access as DA
  13. from utility.parseutils import *
  14. from utility.connect import *
  15. from datetime import datetime
  16. import pandas as pd
  17. import dataset
  18. import time
  19. import json
  20. import re
  21. import sys, os
  22. import socket
  23. import brotli
  24. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  25. import urllib.parse
  26. #chrome_window=False
  27. chrome_window=True
  28. globalkw=None
  29. proxyport=8787
  30. def build_cache(db):
  31. id_dict={}
  32. cursor = db.query('SELECT place_id FROM google_poi.swire_store_list;')
  33. for c in cursor:
  34. id_dict[c['place_id']]=1
  35. return id_dict
  36. #
  37. def brower_start(port):
  # Create and return the selenium-wire browser session used by the crawler.
  #
  # port: only used to build the Remote command_executor URL
  #       ('http://127.0.0.1:<port>/wd/hub'); the local Chrome call does not use it.
  # Returns the object bound to `browser`.
  #
  # NOTE(review): this listing has lost its indentation, so the extent of the
  # if/else bodies cannot be fully read off the text. In particular it is not
  # visible whether set_window_size() runs for both browsers or only for the
  # Remote one. Confirm against the real file before restructuring.
  38. global proxyport
  39. global chrome_window
  40. print(proxyport)
  41. options = webdriver.ChromeOptions()
  # chrome_window truthy: local Chrome, started with --headless.
  42. if chrome_window:
  43. options.add_argument('--ignore-certificate-errors')
  44. options.add_argument("--no-sandbox")
  45. options.add_argument("--headless")
  46. options.add_argument("--disable-dev-shm-usage")
  47. browser = webdriver.Chrome(
  48. desired_capabilities=options.to_capabilities()
  49. )
  # Otherwise: a second options object (`chrome_options`) is built, pointing
  # Chrome's proxy at host.docker.internal:<proxyport>, and a Remote session is
  # requested; selenium-wire is told to listen on 0.0.0.0:<proxyport> with
  # auto_config disabled. The `options` object created above is not used here.
  50. else:
  51. chrome_options = webdriver.ChromeOptions()
  52. chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport)) # Specify your Kubernetes service-name here
  53. chrome_options.add_argument('--ignore-certificate-errors')
  54. chrome_options.add_argument("--no-sandbox")
  55. chrome_options.add_argument("--disable-dev-shm-usage")
  56. browser = webdriver.Remote(
  57. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  58. desired_capabilities=chrome_options.to_capabilities(),
  59. seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
  60. )
  61. # seleniumwire_options = {'addr': '172.17.0.2','port':4444})
  # Fixed 1400x1000 window, then hand the session back to the caller.
  62. browser.set_window_size(1400,1000)
  63. return browser
  64. def page_down_(driver, xpath_css, time_):
  65. e = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
  66. result_count = e.text.split('-')[1].replace(' 項結果','')
  67. print(result_count)
  68. if int(result_count) > 5:
  69. for i in range(time_):
  70. e = driver.find_elements_by_css_selector('div[class="TFQHme"]')
  71. action = webdriver.common.action_chains.ActionChains(driver)
  72. action.move_to_element_with_offset(e[-1], e[-1].size['width'] + 1 , 0)
  73. action.click()
  74. action.perform()
  75. time.sleep(0.5)
  76. def get_url_list(driver):
  77. page_down_(driver, '//div[@class="TFQHme"]', 8)
  78. url_soup = BeautifulSoup(driver.page_source, 'html.parser')
  79. url_list = []
  80. for i in url_soup.find_all('a'):
  81. try:
  82. if i['href'].find('maps/place') != -1:
  83. url_list += [[i['href'], i['aria-label']]]
  84. except:
  85. pass
  86. # print(len(url_list))
  87. return url_list
  88. def keyin_keyword(driver, keyword):
  89. button = driver.find_element_by_id("searchbox")
  90. driver.implicitly_wait(30)
  91. ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
  92. time.sleep(3)
  93. def scan_job(db,kw):
  94. result={'kw':kw}
  95. cursor = db.query('select t1.num,next-prev as diff from google_poi.conv_log t1, (SELECT num,max(id) mid FROM google_poi.conv_log group by num ) t2 where t1.id=t2.mid having diff>0 order by rand()')
  96. for c in cursor:
  97. result['num']=c['num']
  98. break
  99. cursor = db.query('select lat,lon,loc from lat_lon_loc where num ="'+str(result['num'])+'"')
  100. for c in cursor:
  101. result['lat']=c['lat']
  102. result['lon']=c['lon']
  103. result['loc']=c['loc']
  104. return result
  105. def get_next_job(db,repeat=False,repkw=None,repnum=None):
  # Build the job dict that main() consumes; depending on the path taken it
  # carries some of 'kw', 'num', 'lat', 'lon', 'loc'.
  #
  # db     - object whose query(sql) yields rows indexable by column name.
  # repeat - when truthy, enables the overrides further down.
  # repkw  - keyword override; strings containing 'REP' select the
  #          swire_store_list path below.
  # repnum - area number override; the literal 'REP' asks for a random one.
  #
  # NOTE(review): this listing has lost its indentation, so the nesting of the
  # branches below (what is inside each `if`/`for`, and whether line 154's
  # `return` is inside the loop) cannot be read off the text. The comments
  # describe statement order only; confirm nesting against the real file.
  # `globalkw` is declared global here but only appears in commented-out code.
  106. global globalkw
  107. result={}
  108. # if globalkw is not None:
  109. # cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where kw="'+globalkw+'"')
  110. # else:
  111. # cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where num < 367 order by rand() limit 1')
  112. # cursor = db.query('select kw,num from areacodes where expand=0 order by rand()')
  # Candidate (kw, num) comes from a randomly ordered scan of areacodes; the
  # `break` means at most one row is consumed.
  113. cursor = db.query('select kw,num from areacodes order by rand()')
  114. for c in cursor:
  115. # repkw=c['kw']
  116. if repkw is None:
  117. repkw=c['kw']
  118. result['kw']=c['kw']
  119. result['num']=c['num']
  120. break
  # A keyword supplied by the caller (or adopted above) replaces the row's kw.
  121. if repkw is not None:
  122. result['kw']=repkw
  # Coordinates for the chosen num; the SQL is built by string concatenation.
  123. if result.get('num') is not None:
  124. cursor = db.query('select lat,lon,loc from lat_lon_loc where num ="'+str(result['num'])+'"')
  125. for c in cursor:
  126. result['lat']=c['lat']
  127. result['lon']=c['lon']
  128. result['loc']=c['loc']
  129. break
  130. if repeat and repkw!= 'REP':
  131. result['kw']=repkw
  132. result['num']=repnum
  # NOTE(review): if repkw is still None here (no override and no areacodes
  # row) the `in` test raises TypeError.
  133. if 'REP' in repkw:
  134. if repnum=='REP':
  135. repnum=None
  136. # cursor = db.query('select num from swire_store_list where num not in (select num from conv_log) order by rand() limit 1')
  137. cursor = db.query('select num from swire_store_list order by rand() limit 1')
  138. for c in cursor:
  139. repnum=c['num']
  140. break
  141. if repnum is None:
  142. cursor = db.query('select num from swire_store_list order by rand() limit 1')
  143. for c in cursor:
  144. repnum=c['num']
  145. break
  146. # cursor = db.query('select lat_txt,lon_txt,keyword,num from swire_store_list order by rand() limit 1')
  # Job fields are taken from one stored place with the chosen num; 'loc' is
  # set to the empty string on this path. repnum is concatenated into the SQL.
  147. cursor = db.query('select lat_txt,lon_txt,keyword,num from swire_store_list where num="'+str(repnum)+'" limit 1')
  148. for c in cursor:
  149. result['kw']=c['keyword']
  150. result['num']=c['num']
  151. result['lat']=c['lat_txt']
  152. result['lon']=c['lon_txt']
  153. result['loc']=''
  154. return result
  # repeat path: kw/lat/lon are overwritten from one random stored place
  # ('num' and 'loc' are not touched by these statements).
  155. if repeat:
  156. # cursor = db.query('select lat_txt,lon_txt from swire_store_list where num ="'+str(result['num'])+'" and keyword="'+result['kw']+'" order by rand() limit 1')
  157. cursor = db.query('select lat_txt,lon_txt,keyword from swire_store_list order by rand() limit 1')
  158. for c in cursor:
  159. result['kw']=c['keyword']
  160. result['lat']=c['lat_txt']
  161. result['lon']=c['lon_txt']
  162. return result
  163. def write_to_file(jsobj,fname):
  164. import codecs
  165. fw=codecs.open(fname,'w','utf-8')
  166. fw.write(str(jsobj))
  167. fw.close()
  168. def parsing_js(orig):
  169. resultobj=[]
  170. content=""
  171. lines=orig.split('\n')
  172. for l in lines:
  173. newl=l.replace('\\"','"')
  174. # if '\\\\"' in newl:
  175. # print(newl)
  176. # newl=newl.repace('\\\\"','')
  177. newl=newl.replace('\\"','"')
  178. content+=newl
  179. result=re.search(r'\[\["',content)
  180. print(result)
  181. content_begin=result.start()
  182. result=re.search(r'\]\]"',content)
  183. print(result)
  184. content_end=result.end()
  185. jscontent=content[content_begin:content_end-1]
  186. # write_to_file(jscontent,'c:/tmp/debug.txt')
  187. jsobj=json.loads(jscontent)
  188. for x in jsobj[0][1][1:]:
  189. print(x[14][11])
  190. print(x[14][9])
  191. reviews_cnt=None
  192. photo=None
  193. rating=None
  194. biz_id=None
  195. loc_x=None
  196. loc_y=None
  197. addr_elmts=None
  198. tel=None
  199. try:
  200. rating=x[14][4][7]
  201. reviews_cnt=x[14][4][8]
  202. except:
  203. traceback.print_exc()
  204. try:
  205. photo=x[14][37][0][0][0]
  206. num_photos=x[14][37][0][0][6][1]
  207. except:
  208. traceback.print_exc()
  209. try:
  210. loc_x=x[14][37][0][0][29][0]
  211. loc_y=x[14][37][0][0][29][1]
  212. except:
  213. traceback.print_exc()
  214. try:
  215. biz_id=x[14][57][2]
  216. tel=x[14][178][0][3]
  217. except:
  218. traceback.print_exc()
  219. try:
  220. addr_elmts=str(x[14][82])
  221. except:
  222. traceback.print_exc()
  223. category=str(x[14][13])
  224. topic=str(x[14][89])
  225. print(x[14][13])
  226. print(x[14][10])
  227. print(x[14][2])
  228. print(x[14][78])
  229. try:
  230. resultobj.append({'name':x[14][11],'fid':x[14][10],'addr':x[14][2][0],'addr_elmts':addr_elmts,'place_id':x[14][78],'category':category,'rating':rating,'reviews_cnt':reviews_cnt,'lat':x[14][9][2],'lat_txt':str(x[14][9][2]),'lon':x[14][9][3],'lon_txt':str(x[14][9][3]),'topic':topic,'photo':photo,'num_photos':num_photos,'loc_x':loc_x,'loc_y':loc_y,'biz_id':biz_id,'tel':tel,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
  231. except:
  232. traceback.print_exc()
  233. return resultobj
  234. def save_js_to_db(jsobj,num,keyword):
  235. global store_list_table
  236. global iddict
  237. for r in jsobj:
  238. if iddict.get(r['place_id']) is not None:
  239. continue
  240. r['num']=num
  241. r['keyword']=keyword
  242. try:
  243. store_list_table.insert(r)
  244. # store_list_table.upsert(r,keys=['place_id'])
  245. except:
  246. traceback.print_exc()
  247. # store_list_table.upsert(r,keys=['place_id'])
  248. def process_web_request(db,driver,area_num,keyword):
  # Harvest search responses captured by selenium-wire and store their places.
  #
  # db       - used for the row count query and for the conv_log insert.
  # driver   - selenium-wire driver; its captured `requests` are scanned.
  # area_num - area number passed to save_js_to_db and used in the count query.
  # keyword  - search keyword passed to save_js_to_db.
  #
  # Reads the module global prev_cnt (assigned in main) for the conv_log row.
  #
  # NOTE(review): this listing has lost its indentation, so it is not visible
  # whether the count query and conv_log insert run once per matching response
  # or once per call. Confirm nesting against the real file.
  249. global prev_cnt
  250. # query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
  # Two fixed pauses (3.8 s in total) before the captured requests are read.
  251. time.sleep(0.8)
  252. time.sleep(3)
  253. print("ppppppppp&**********************")
  # NOTE(review): nothing here clears driver.requests, so presumably requests
  # captured before earlier calls are scanned again — confirm.
  254. for request in driver.requests:
  255. if 'search?' in request.url :
  256. print('searching.....')
  257. # else:
  258. # print(request.url[20:60])
  259. if request.response:
  260. # if 'https://www.google.com.tw/search?tbm=map' in request.url :
  261. if 'search?' in request.url :
  262. print('parsing js:')
  # NOTE(review): the body is always passed to brotli.decompress — assumes
  # every matching response is brotli-encoded; confirm.
  263. resp = brotli.decompress(request.response.body)
  264. jstext=resp.decode('utf-8')
  265. resultobj=parsing_js(jstext)
  266. print("before",datetime.now())
  267. print("num: "+str(area_num))
  268. save_js_to_db(resultobj,area_num,keyword)
  269. print("after",datetime.now())
  # Row count for this area after saving, logged next to prev_cnt in conv_log.
  # The SQL is built by string concatenation from area_num.
  270. aft_cnt=0
  271. cursor = db.query('select count(*) as cnt from swire_store_list where num="'+str(area_num)+'" ')
  272. for c in cursor:
  273. aft_cnt=c['cnt']
  274. break
  275. db['conv_log'].insert({'num':area_num,'prev':prev_cnt,'next':aft_cnt,'dt':datetime.now()})
  276. # time.sleep(9999)
  277. def main():
  278. global chrome_window
  279. global store_list_table
  280. global globalkw
  281. global proxyport
  282. global iddict
  283. global prev_cnt
  284. port=4444
  285. # if len(sys.argv) == 3 :
  286. # port=int(sys.argv[1])
  287. # proxyport=int(sys.argv[2])
  288. if len(sys.argv)>1:
  289. globalkw=sys.argv[1]
  290. port=int(sys.argv[2])
  291. proxyport=int(sys.argv[3])
  292. print(globalkw, port, proxyport)
  293. failcnt=0
  294. localip=socket.gethostbyname(socket.gethostname())
  295. # if localip=='192.168.1.108':
  296. # chrome_window=True
  297. # chrome_window=False
  298. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
  299. iddict=build_cache(db)
  300. store_list_table = db['swire_store_list']
  301. # table2 = db['swire_progress_list']
  302. table2 = db['swire_area_progress']
  303. if not chrome_window:
  304. print('restart docker p{}'.format(port))
  305. # os.system('sudo docker container restart p'+str(port))
  306. os.system('docker container restart p'+str(port))
  307. time.sleep(10)
  308. print('drvier start...')
  309. driver = brower_start(port)
  310. area_num=None
  311. while True:
  312. try:
  313. if len(sys.argv) > 4 :
  314. repkw=sys.argv[1]
  315. repnum=sys.argv[2]
  316. if 'SCAN' in repkw:
  317. job=scan_job(db,repnum)
  318. else:
  319. job=get_next_job(db,repeat=True,repkw=repkw,repnum=repnum)
  320. else:
  321. job=get_next_job(db, repkw=globalkw)
  322. print(job)
  323. keyword = job['kw']
  324. latitude = job['lat'] #緯度
  325. longitude = job['lon'] #精度
  326. area_num=job['num']
  327. safe_string = urllib.parse.quote_plus(keyword)
  328. url = 'https://www.google.com.tw/maps/@{},{},18z?hl=zh-TW'.format(latitude, longitude)
  329. prev_cnt=0
  330. cursor = db.query('select count(*) as cnt from swire_store_list where num="'+str(area_num)+'" ')
  331. for c in cursor:
  332. prev_cnt=c['cnt']
  333. break
  334. # url = 'https://www.google.com/maps/search/'+safe_string+'/@{},{},16z/data=!3m1!4b1'.format(latitude, longitude)
  335. # url='https://www.google.com/maps/search/'+safe_string+'/@24.7962279,121.0449762,15z/data=!3m1!4b1?hl=zh-TW'
  336. # print(url)
  337. # url='https://www.google.com/maps/search/%E7%81%AB%E9%8D%8B%E9%A4%90%E5%BB%B3/@24.772608,121.0515456,13z'
  338. driver.get(url)
  339. # time.sleep(3)
  340. keyin_keyword(driver, keyword)
  341. process_web_request(db,driver,area_num,keyword)
  342. pagecnt=0
  343. while True:
  344. element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
  345. if element.get_attribute('disabled'):
  346. break
  347. # driver.implicitly_wait(30)
  348. ActionChains(driver).move_to_element(element).click(element).perform()
  349. process_web_request(db,driver,area_num,keyword)
  350. pagecnt+=1
  351. if pagecnt>=5:
  352. break
  353. # table2.upsert({'kw':keyword,'num':job['num']},['kw'])
  354. table2.insert({'kw':keyword,'num':job['num']},['kw'])
  355. db.query('update areacodes set expand = 1 where num="'+str(job['num'])+'" and kw="'+keyword+'" ')
  356. except:
  357. traceback.print_exc()
  358. failcnt+=1
  359. if failcnt>=15:
  360. sys.exit()
  361. pass
  362. if __name__ == '__main__':
  363. main()