swire_docker_itemlist.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452
  1. # -*- coding: utf-8 -*-
  2. #from selenium import webdriver
  3. from seleniumwire import webdriver
  4. from selenium.webdriver.common.action_chains import ActionChains
  5. from selenium.webdriver.common.keys import Keys
  6. from selenium.webdriver.support import expected_conditions as EC
  7. from selenium.webdriver.support.wait import WebDriverWait
  8. from selenium.webdriver.common.by import By
  9. import selenium
  10. import traceback
  11. from datetime import datetime
  12. import dataset
  13. import time
  14. import json
  15. import gzip
  16. import re
  17. import sys, os
  18. import socket
  19. import brotli
  20. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  21. import urllib.parse
  22. from seleniumwire.utils import decode as sw_decode
# Module-level runtime settings read (and partly overwritten) by the functions below.
#chrome_window=False
# When true, brower_start() launches a local headless Chrome; otherwise it
# attaches to a remote WebDriver endpoint through the selenium-wire proxy.
chrome_window=True
# Search keyword taken from the command line in main(); None until supplied.
globalkw=None
# Port used for the selenium-wire proxy in the remote-browser branch of brower_start().
proxyport=8787
  27. def build_cache(db):
  28. id_dict={}
  29. cursor = db.query('SELECT place_id FROM google_poi.swire_store_list;')
  30. for c in cursor:
  31. id_dict[c['place_id']]=1
  32. return id_dict
  33. #
  34. def brower_start(port):
  35. global proxyport
  36. global chrome_window
  37. print(proxyport)
  38. options = webdriver.ChromeOptions()
  39. if chrome_window:
  40. options.add_argument('--ignore-certificate-errors')
  41. options.add_argument("--no-sandbox")
  42. options.add_argument("--headless")
  43. options.add_argument("--disable-gpu")
  44. options.add_argument("--disable-dev-shm-usage")
  45. browser = webdriver.Chrome(
  46. options=options
  47. # ,seleniumwire_options={'disable_encoding': True}
  48. # desired_capabilities=options.to_capabilities()
  49. )
  50. browser.set_window_size(1400,1000)
  51. else:
  52. chrome_options = webdriver.ChromeOptions()
  53. chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport)) # Specify your Kubernetes service-name here
  54. chrome_options.add_argument('--ignore-certificate-errors')
  55. chrome_options.add_argument("--no-sandbox")
  56. chrome_options.add_argument("--disable-dev-shm-usage")
  57. browser = webdriver.Remote(
  58. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  59. desired_capabilities=chrome_options.to_capabilities(),
  60. seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
  61. )
  62. # seleniumwire_options = {'addr': '172.17.0.2','port':4444})
  63. browser.set_window_size(1400,1000)
  64. return browser
  65. def page_down_(driver, xpath_css, time_):
  66. e = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
  67. result_count = e.text.split('-')[1].replace(' 項結果','')
  68. print(result_count)
  69. if int(result_count) > 5:
  70. for i in range(time_):
  71. e = driver.find_elements_by_css_selector('div[class="TFQHme"]')
  72. action = webdriver.common.action_chains.ActionChains(driver)
  73. action.move_to_element_with_offset(e[-1], e[-1].size['width'] + 1 , 0)
  74. action.click()
  75. action.perform()
  76. time.sleep(0.5)
  77. def keyin_keyword(driver, keyword):
  78. print('key in keyword:' +keyword)
  79. button = driver.find_element_by_id("searchbox")
  80. driver.implicitly_wait(30)
  81. ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
  82. time.sleep(3)
  83. def scan_job(db,kw):
  84. result={'kw':kw}
  85. cursor = db.query('select t1.num,next-prev as diff from google_poi.conv_log t1, (SELECT num,max(id) mid FROM google_poi.conv_log group by num ) t2 where t1.id=t2.mid having diff>0 order by rand()')
  86. for c in cursor:
  87. result['num']=c['num']
  88. break
  89. cursor = db.query('select lat,lon,loc from lat_lon_loc where num ="'+str(result['num'])+'"')
  90. for c in cursor:
  91. result['lat']=c['lat']
  92. result['lon']=c['lon']
  93. result['loc']=c['loc']
  94. return result
  95. def get_next_job(db,repeat=False,repkw=None,repnum=None):
  96. global globalkw
  97. result={}
  98. # if globalkw is not None:
  99. # cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where kw="'+globalkw+'"')
  100. # else:
  101. # cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where num < 367 order by rand() limit 1')
  102. # cursor = db.query('select kw,num from areacodes where expand=0 order by rand()')
  103. cursor = db.query('select kw,num from areacodes order by rand()')
  104. for c in cursor:
  105. # repkw=c['kw']
  106. if repkw is None:
  107. repkw=c['kw']
  108. result['kw']=c['kw']
  109. result['num']=c['num']
  110. break
  111. if repkw is not None:
  112. result['kw']=repkw
  113. if result.get('num') is not None:
  114. cursor = db.query('select lat,lon,loc from lat_lon_loc where num ="'+str(result['num'])+'"')
  115. for c in cursor:
  116. result['lat']=c['lat']
  117. result['lon']=c['lon']
  118. result['loc']=c['loc']
  119. break
  120. if repeat and repkw!= 'REP':
  121. result['kw']=repkw
  122. result['num']=repnum
  123. # if 'REP' in repkw:
  124. # if repnum=='REP':
  125. # repnum=None
  126. # cursor = db.query('select num from swire_store_list order by rand() limit 1')
  127. # for c in cursor:
  128. # repnum=c['num']
  129. # break
  130. # if repnum is None:
  131. # cursor = db.query('select num from swire_store_list order by rand() limit 1')
  132. # for c in cursor:
  133. # repnum=c['num']
  134. # break
  135. # cursor = db.query('select lat_txt,lon_txt,keyword,num from swire_store_list order by rand() limit 1')
  136. # cursor = db.query('select lat_txt,lon_txt,keyword,num from swire_store_list where num="'+str(repnum)+'" limit 1')
  137. # for c in cursor:
  138. # result['kw']=c['keyword']
  139. # result['num']=c['num']
  140. # result['lat']=c['lat_txt']
  141. # result['lon']=c['lon_txt']
  142. # result['loc']=''
  143. # return result
  144. if repeat:
  145. # cursor = db.query('select lat_txt,lon_txt from swire_store_list where num ="'+str(result['num'])+'" and keyword="'+result['kw']+'" order by rand() limit 1')
  146. # cursor = db.query('select lat_txt,lon_txt,keyword from swire_store_list order by rand() limit 1')
  147. cursor = db.query('select lat_txt,lon_txt,keyword,num from swire_store_list where keyword <> "火鍋餐廳" order by rand() limit 1')
  148. for c in cursor:
  149. result['kw']=c['keyword']
  150. result['lat']=c['lat_txt']
  151. result['lon']=c['lon_txt']
  152. result['num']=c['num']
  153. result['loc']=''
  154. return result
  155. def write_to_file(jsobj,fname):
  156. import codecs
  157. fw=codecs.open(fname,'w','utf-8')
  158. fw.write(str(jsobj))
  159. fw.close()
  160. def parsing_js(orig):
  161. resultobj=[]
  162. content=""
  163. lines=orig.split('\n')
  164. for l in lines:
  165. newl=l.replace('\\"','"')
  166. # if '\\\\"' in newl:
  167. # print(newl)
  168. # newl=newl.repace('\\\\"','')
  169. newl=newl.replace('\\"','"')
  170. content+=newl
  171. result=re.search(r'\[\["',content)
  172. print(result)
  173. content_begin=result.start()
  174. result=re.search(r'\]\]"',content)
  175. print(result)
  176. content_end=result.end()
  177. jscontent=content[content_begin:content_end-1]
  178. # write_to_file(jscontent,'c:/tmp/debug.txt')
  179. # write_to_file(jscontent,'c:/tmp/headless.txt')
  180. jsobj=json.loads(jscontent)
  181. for x in jsobj[0][1][1:]:
  182. print(x[14][11])
  183. print(x[14][9])
  184. reviews_cnt=None
  185. photo=None
  186. rating=None
  187. biz_id=None
  188. loc_x=None
  189. loc_y=None
  190. addr_elmts=None
  191. tel=None
  192. try:
  193. rating=x[14][4][7]
  194. reviews_cnt=x[14][4][8]
  195. except:
  196. traceback.print_exc()
  197. try:
  198. photo=x[14][37][0][0][0]
  199. num_photos=x[14][37][0][0][6][1]
  200. except:
  201. traceback.print_exc()
  202. try:
  203. loc_x=x[14][37][0][0][29][0]
  204. loc_y=x[14][37][0][0][29][1]
  205. except:
  206. traceback.print_exc()
  207. try:
  208. biz_id=x[14][57][2]
  209. tel=x[14][178][0][3]
  210. except:
  211. traceback.print_exc()
  212. try:
  213. addr_elmts=str(x[14][82])
  214. except:
  215. traceback.print_exc()
  216. category=str(x[14][13])
  217. topic=str(x[14][89])
  218. print(x[14][13])
  219. print(x[14][10])
  220. print(x[14][2])
  221. print(x[14][78])
  222. try:
  223. resultobj.append({'name':x[14][11],'fid':x[14][10],'addr':x[14][2][0],'addr_elmts':addr_elmts,'place_id':x[14][78],'category':category,'rating':rating,'reviews_cnt':reviews_cnt,'lat':x[14][9][2],'lat_txt':str(x[14][9][2]),'lon':x[14][9][3],'lon_txt':str(x[14][9][3]),'topic':topic,'photo':photo,'num_photos':num_photos,'loc_x':loc_x,'loc_y':loc_y,'biz_id':biz_id,'tel':tel,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
  224. except:
  225. traceback.print_exc()
  226. return resultobj
  227. def save_js_to_db(jsobj,num,keyword):
  228. global store_list_table
  229. global iddict
  230. for r in jsobj:
  231. if iddict.get(r['place_id']) is not None:
  232. continue
  233. r['num']=num
  234. r['keyword']=keyword
  235. try:
  236. store_list_table.insert(r)
  237. # store_list_table.upsert(r,keys=['place_id'])
  238. except:
  239. traceback.print_exc()
  240. # store_list_table.upsert(r,keys=['place_id'])
  241. def process_web_request(db,driver,area_num,keyword):
  242. global prev_cnt
  243. # query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
  244. time.sleep(0.8)
  245. time.sleep(3)
  246. print("ppppppppp&**********************")
  247. for request in driver.requests:
  248. if 'search?' in request.url :
  249. print('searching.....')
  250. # else:
  251. # print(request.url[20:60])
  252. if request.response:
  253. # if 'https://www.google.com.tw/search?tbm=map' in request.url :
  254. if 'search?' in request.url :
  255. print('parsing js:')
  256. # resp=request.response.body
  257. # resp = sw_decode(request.response.body, request.response.headers.get('Content-Encoding', 'identity'))
  258. # data = data.decode("utf8")
  259. # print(request.response.header)
  260. # sys.exit()
  261. # driver.quit()
  262. resp = request.response.body
  263. print(request.response.headers.get('Content-Encoding'))
  264. if 'gzip' in request.response.headers.get('Content-Encoding'):
  265. resp = gzip.decompress(request.response.body)
  266. if 'br' in request.response.headers.get('Content-Encoding'):
  267. resp = brotli.decompress(request.response.body)
  268. jstext=resp.decode('utf-8')
  269. resultobj=parsing_js(jstext)
  270. print("before",datetime.now())
  271. print("num: "+str(area_num))
  272. save_js_to_db(resultobj,area_num,keyword)
  273. print("after",datetime.now())
  274. # aft_cnt=0
  275. # cursor = db.query('select count(*) as cnt from swire_store_list where num="'+str(area_num)+'" ')
  276. # for c in cursor:
  277. # aft_cnt=c['cnt']
  278. # break
  279. # db['conv_log'].insert({'num':area_num,'prev':prev_cnt,'next':aft_cnt,'dt':datetime.now()})
  280. del driver.requests
  281. # time.sleep(9999)
  282. def main():
  283. global chrome_window
  284. global store_list_table
  285. global globalkw
  286. global proxyport
  287. global iddict
  288. global prev_cnt
  289. port=4444
  290. # if len(sys.argv) == 3 :
  291. # port=int(sys.argv[1])
  292. # proxyport=int(sys.argv[2])
  293. if len(sys.argv)>1:
  294. globalkw=sys.argv[1]
  295. port=int(sys.argv[2])
  296. proxyport=int(sys.argv[3])
  297. print(globalkw, port, proxyport)
  298. failcnt=0
  299. localip=socket.gethostbyname(socket.gethostname())
  300. # if localip=='192.168.1.108':
  301. # chrome_window=True
  302. # chrome_window=False
  303. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
  304. iddict=build_cache(db)
  305. store_list_table = db['swire_store_list']
  306. # table2 = db['swire_progress_list']
  307. table2 = db['swire_area_progress']
  308. if not chrome_window:
  309. print('restart docker p{}'.format(port))
  310. # os.system('sudo docker container restart p'+str(port))
  311. os.system('docker container restart p'+str(port))
  312. time.sleep(10)
  313. print('drvier start...')
  314. driver = brower_start(port)
  315. area_num=None
  316. repeating=False
  317. while True:
  318. try:
  319. if len(sys.argv) > 4 :
  320. repkw=sys.argv[1]
  321. repnum=sys.argv[2]
  322. if 'SCAN' in repkw:
  323. job=scan_job(db,repnum)
  324. else:
  325. repeating=True
  326. job=get_next_job(db,repeat=True,repkw=repkw,repnum=repnum)
  327. else:
  328. job=get_next_job(db, repkw=globalkw)
  329. print(job)
  330. keyword = job['kw']
  331. latitude = job['lat'] #緯度
  332. longitude = job['lon'] #精度
  333. area_num=job['num']
  334. safe_string = urllib.parse.quote_plus(keyword)
  335. url = 'https://www.google.com.tw/maps/@{},{},18z?hl=zh-TW'.format(latitude, longitude)
  336. # prev_cnt=0
  337. # cursor = db.query('select count(*) as cnt from swire_store_list where num="'+str(area_num)+'" ')
  338. # for c in cursor:
  339. # prev_cnt=c['cnt']
  340. # break
  341. # url = 'https://www.google.com/maps/search/'+safe_string+'/@{},{},16z/data=!3m1!4b1'.format(latitude, longitude)
  342. # url='https://www.google.com/maps/search/'+safe_string+'/@24.7962279,121.0449762,15z/data=!3m1!4b1?hl=zh-TW'
  343. # print(url)
  344. # url='https://www.google.com/maps/search/%E7%81%AB%E9%8D%8B%E9%A4%90%E5%BB%B3/@24.772608,121.0515456,13z'
  345. driver.get(url)
  346. time.sleep(3)
  347. keyin_keyword(driver, keyword)
  348. process_web_request(db,driver,area_num,keyword)
  349. pagecnt=0
  350. while True:
  351. element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
  352. if element.get_attribute('disabled'):
  353. break
  354. # driver.implicitly_wait(30)
  355. ActionChains(driver).move_to_element(element).click(element).perform()
  356. process_web_request(db,driver,area_num,keyword)
  357. if repeating:
  358. break
  359. pagecnt+=1
  360. if pagecnt>=5:
  361. break
  362. # table2.upsert({'kw':keyword,'num':job['num']},['kw'])
  363. table2.insert({'kw':keyword,'num':job['num']},['kw'])
  364. db.query('update areacodes set expand = 1 where num="'+str(job['num'])+'" and kw="'+keyword+'" ')
  365. except:
  366. traceback.print_exc()
  367. failcnt+=1
  368. if failcnt>=15:
  369. sys.exit()
  370. pass
  371. if __name__ == '__main__':
  372. main()