swire_docker_itemlist.py

# -*- coding: utf-8 -*-
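"""Google Maps POI list crawler (selenium-wire + headless/remote Chrome).

Searches Google Maps around lat/lon points stored in the google_poi MySQL schema,
intercepts the map 'search?' XHR responses, parses the embedded JSON and writes the
POI rows to swire_store_list; progress is tracked in conv_log, swire_area_progress
and areacodes. Command-line arguments (see main()): <keyword> <selenium port>
<selenium-wire proxy port>.
"""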
#from selenium import webdriver
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
import selenium
import traceback
from datetime import datetime
import dataset
import time
import json
import re
import sys, os
import socket
import brotli
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import urllib.parse
from bs4 import BeautifulSoup   # used by get_url_list(); missing from the original listing

#chrome_window=False
chrome_window=True
globalkw=None
proxyport=8787
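
# Return a dict of every place_id already stored in google_poi.swire_store_list,
# used as an in-memory de-duplication cache by save_js_to_db().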
def build_cache(db):
    id_dict={}
    cursor = db.query('SELECT place_id FROM google_poi.swire_store_list;')
    for c in cursor:
        id_dict[c['place_id']]=1
    return id_dict
#
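
# Start a browser session: a local headless Chrome when chrome_window is set,
# otherwise a Remote Chrome at http://127.0.0.1:<port>/wd/hub whose traffic is
# routed through the selenium-wire proxy listening on proxyport.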
def brower_start(port):
    global proxyport
    global chrome_window
    print(proxyport)
    options = webdriver.ChromeOptions()
    if chrome_window:
        options.add_argument('--ignore-certificate-errors')
        options.add_argument("--no-sandbox")
        options.add_argument("--headless")
        options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Chrome(
            desired_capabilities=options.to_capabilities()
        )
    else:
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport))  # Specify your Kubernetes service-name here
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
            desired_capabilities=chrome_options.to_capabilities(),
            seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
        )
#        seleniumwire_options = {'addr': '172.17.0.2','port':4444})
    browser.set_window_size(1400,1000)
    return browser
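
# Scroll the Maps result list by clicking just past the last 'TFQHme' divider
# element, time_ times, but only when the reported result count exceeds five.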
def page_down_(driver, xpath_css, time_):
    # xpath_css is accepted but not used; the selectors below are hard-coded
    e = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
    result_count = e.text.split('-')[1].replace(' 項結果','')   # strip the zh-TW " results" suffix
    print(result_count)
    if int(result_count) > 5:
        for i in range(time_):
            e = driver.find_elements_by_css_selector('div[class="TFQHme"]')
            action = webdriver.common.action_chains.ActionChains(driver)
            action.move_to_element_with_offset(e[-1], e[-1].size['width'] + 1 , 0)
            action.click()
            action.perform()
            time.sleep(0.5)
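
# Collect (href, aria-label) pairs for every '/maps/place' anchor on the current
# result page after scrolling it with page_down_().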
def get_url_list(driver):
    page_down_(driver, '//div[@class="TFQHme"]', 8)
    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except:
            pass
#    print(len(url_list))
    return url_list
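
# Type the search keyword into the Maps search box and submit it with ENTER.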
def keyin_keyword(driver, keyword):
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
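
# Pick a random area num whose latest conv_log entry still shows growth
# (next - prev > 0) and return its lat/lon/loc from lat_lon_loc with the given kw.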
def scan_job(db,kw):
    result={'kw':kw}
    cursor = db.query('select t1.num,next-prev as diff from google_poi.conv_log t1, (SELECT num,max(id) mid FROM google_poi.conv_log group by num ) t2 where t1.id=t2.mid having diff>0 order by rand()')
    for c in cursor:
        result['num']=c['num']
        break
    cursor = db.query('select lat,lon,loc from lat_lon_loc where num ="'+str(result['num'])+'"')
    for c in cursor:
        result['lat']=c['lat']
        result['lon']=c['lon']
        result['loc']=c['loc']
    return result
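
# Choose the next (keyword, area) job: by default a random row from areacodes,
# optionally overridden by repkw/repnum; the special 'REP' keyword re-crawls a
# random or explicitly given num from swire_store_list instead.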
def get_next_job(db,repeat=False,repkw=None,repnum=None):
    global globalkw
    result={}
#    if globalkw is not None:
#        cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where kw="'+globalkw+'"')
#    else:
#        cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where num < 367 order by rand() limit 1')
#    cursor = db.query('select kw,num from areacodes where expand=0 order by rand()')
    cursor = db.query('select kw,num from areacodes order by rand()')
    for c in cursor:
#        repkw=c['kw']
        if repkw is None:
            repkw=c['kw']
        result['kw']=c['kw']
        result['num']=c['num']
        break
    if repkw is not None:
        result['kw']=repkw
    if result.get('num') is not None:
        cursor = db.query('select lat,lon,loc from lat_lon_loc where num ="'+str(result['num'])+'"')
        for c in cursor:
            result['lat']=c['lat']
            result['lon']=c['lon']
            result['loc']=c['loc']
            break
    if repeat and repkw != 'REP':
        result['kw']=repkw
        result['num']=repnum
    if 'REP' in repkw:
        if repnum=='REP':
            repnum=None
#            cursor = db.query('select num from swire_store_list where num not in (select num from conv_log) order by rand() limit 1')
            cursor = db.query('select num from swire_store_list order by rand() limit 1')
            for c in cursor:
                repnum=c['num']
                break
        if repnum is None:
            cursor = db.query('select num from swire_store_list order by rand() limit 1')
            for c in cursor:
                repnum=c['num']
                break
#        cursor = db.query('select lat_txt,lon_txt,keyword,num from swire_store_list order by rand() limit 1')
        cursor = db.query('select lat_txt,lon_txt,keyword,num from swire_store_list where num="'+str(repnum)+'" limit 1')
        for c in cursor:
            result['kw']=c['keyword']
            result['num']=c['num']
            result['lat']=c['lat_txt']
            result['lon']=c['lon_txt']
            result['loc']=''
            return result
    if repeat:
#        cursor = db.query('select lat_txt,lon_txt from swire_store_list where num ="'+str(result['num'])+'" and keyword="'+result['kw']+'" order by rand() limit 1')
        cursor = db.query('select lat_txt,lon_txt,keyword from swire_store_list order by rand() limit 1')
        for c in cursor:
            result['kw']=c['keyword']
            result['lat']=c['lat_txt']
            result['lon']=c['lon_txt']
    return result
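
# Dump a Python object to a UTF-8 text file (debugging helper).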
def write_to_file(jsobj,fname):
    import codecs
    fw=codecs.open(fname,'w','utf-8')
    fw.write(str(jsobj))
    fw.close()
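
# Parse the JSON blob embedded in a Google Maps 'search?' XHR response and return
# a list of POI dicts (name, address, place_id, rating, coordinates, ...). The
# numeric indexes into x[14] follow the undocumented layout of the response and
# may break whenever Google changes it.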
def parsing_js(orig):
    resultobj=[]
    content=""
    lines=orig.split('\n')
    for l in lines:
        newl=l.replace('\\"','"')
#        if '\\\\"' in newl:
#            print(newl)
#            newl=newl.repace('\\\\"','')
        newl=newl.replace('\\"','"')
        content+=newl
    result=re.search(r'\[\["',content)
    print(result)
    content_begin=result.start()
    result=re.search(r'\]\]"',content)
    print(result)
    content_end=result.end()
    jscontent=content[content_begin:content_end-1]
#    write_to_file(jscontent,'c:/tmp/debug.txt')
    jsobj=json.loads(jscontent)
    for x in jsobj[0][1][1:]:
        print(x[14][11])
        print(x[14][9])
        reviews_cnt=None
        photo=None
        num_photos=None   # was left undefined in the original when the photo lookup below failed
        rating=None
        biz_id=None
        loc_x=None
        loc_y=None
        addr_elmts=None
        tel=None
        try:
            rating=x[14][4][7]
            reviews_cnt=x[14][4][8]
        except:
            traceback.print_exc()
        try:
            photo=x[14][37][0][0][0]
            num_photos=x[14][37][0][0][6][1]
        except:
            traceback.print_exc()
        try:
            loc_x=x[14][37][0][0][29][0]
            loc_y=x[14][37][0][0][29][1]
        except:
            traceback.print_exc()
        try:
            biz_id=x[14][57][2]
            tel=x[14][178][0][3]
        except:
            traceback.print_exc()
        try:
            addr_elmts=str(x[14][82])
        except:
            traceback.print_exc()
        category=str(x[14][13])
        topic=str(x[14][89])
        print(x[14][13])
        print(x[14][10])
        print(x[14][2])
        print(x[14][78])
        try:
            resultobj.append({'name':x[14][11],'fid':x[14][10],'addr':x[14][2][0],'addr_elmts':addr_elmts,'place_id':x[14][78],'category':category,'rating':rating,'reviews_cnt':reviews_cnt,'lat':x[14][9][2],'lat_txt':str(x[14][9][2]),'lon':x[14][9][3],'lon_txt':str(x[14][9][3]),'topic':topic,'photo':photo,'num_photos':num_photos,'loc_x':loc_x,'loc_y':loc_y,'biz_id':biz_id,'tel':tel,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
        except:
            traceback.print_exc()
    return resultobj
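
# Insert parsed POI rows into swire_store_list, tagging each with the area num and
# keyword and skipping place_ids already present in the iddict cache.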
def save_js_to_db(jsobj,num,keyword):
    global store_list_table
    global iddict
    for r in jsobj:
        if iddict.get(r['place_id']) is not None:
            continue
        r['num']=num
        r['keyword']=keyword
        try:
            store_list_table.insert(r)
#            store_list_table.upsert(r,keys=['place_id'])
        except:
            traceback.print_exc()
#            store_list_table.upsert(r,keys=['place_id'])
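
# Walk the requests captured by selenium-wire, decode the brotli-compressed
# 'search?' responses, parse and store the POIs, then log the before/after row
# counts for this area num into conv_log.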
def process_web_request(db,driver,area_num,keyword):
    global prev_cnt
#    query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
    time.sleep(0.8)
    time.sleep(3)
    print("ppppppppp&**********************")
    for request in driver.requests:
        if 'search?' in request.url :
            print('searching.....')
#        else:
#            print(request.url[20:60])
        if request.response:
#            if 'https://www.google.com.tw/search?tbm=map' in request.url :
            if 'search?' in request.url :
                print('parsing js:')
                resp = brotli.decompress(request.response.body)
                jstext=resp.decode('utf-8')
                resultobj=parsing_js(jstext)
                print("before",datetime.now())
                print("num: "+str(area_num))
                save_js_to_db(resultobj,area_num,keyword)
                print("after",datetime.now())
    aft_cnt=0
    cursor = db.query('select count(*) as cnt from swire_store_list where num="'+str(area_num)+'" ')
    for c in cursor:
        aft_cnt=c['cnt']
        break
    db['conv_log'].insert({'num':area_num,'prev':prev_cnt,'next':aft_cnt,'dt':datetime.now()})
#    time.sleep(9999)
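
# Main crawl loop: pick a job, open Google Maps at its coordinates, search the
# keyword, harvest the intercepted results, page through up to five result pages,
# then mark the areacodes row as expanded.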
def main():
    global chrome_window
    global store_list_table
    global globalkw
    global proxyport
    global iddict
    global prev_cnt
    port=4444
#    if len(sys.argv) == 3 :
#        port=int(sys.argv[1])
#        proxyport=int(sys.argv[2])
    if len(sys.argv)>1:
        globalkw=sys.argv[1]
        port=int(sys.argv[2])
        proxyport=int(sys.argv[3])
    print(globalkw, port, proxyport)
    failcnt=0
    localip=socket.gethostbyname(socket.gethostname())
#    if localip=='192.168.1.108':
#        chrome_window=True
#    chrome_window=False
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    iddict=build_cache(db)
    store_list_table = db['swire_store_list']
#    table2 = db['swire_progress_list']
    table2 = db['swire_area_progress']
    if not chrome_window:
        print('restart docker p{}'.format(port))
#        os.system('sudo docker container restart p'+str(port))
        os.system('docker container restart p'+str(port))
        time.sleep(10)
    print('driver start...')
    driver = brower_start(port)
    area_num=None
    while True:
        try:
            if len(sys.argv) > 4 :
                repkw=sys.argv[1]
                repnum=sys.argv[2]
                if 'SCAN' in repkw:
                    job=scan_job(db,repnum)
                else:
                    job=get_next_job(db,repeat=True,repkw=repkw,repnum=repnum)
            else:
                job=get_next_job(db, repkw=globalkw)
            print(job)
            keyword = job['kw']
            latitude = job['lat']    # latitude
            longitude = job['lon']   # longitude
            area_num=job['num']
            safe_string = urllib.parse.quote_plus(keyword)
            url = 'https://www.google.com.tw/maps/@{},{},18z?hl=zh-TW'.format(latitude, longitude)
            prev_cnt=0
            cursor = db.query('select count(*) as cnt from swire_store_list where num="'+str(area_num)+'" ')
            for c in cursor:
                prev_cnt=c['cnt']
                break
#            url = 'https://www.google.com/maps/search/'+safe_string+'/@{},{},16z/data=!3m1!4b1'.format(latitude, longitude)
#            url='https://www.google.com/maps/search/'+safe_string+'/@24.7962279,121.0449762,15z/data=!3m1!4b1?hl=zh-TW'
#            print(url)
#            url='https://www.google.com/maps/search/%E7%81%AB%E9%8D%8B%E9%A4%90%E5%BB%B3/@24.772608,121.0515456,13z'
            driver.get(url)
#            time.sleep(3)
            keyin_keyword(driver, keyword)
            process_web_request(db,driver,area_num,keyword)
            pagecnt=0
            while True:
                # click the "next page" button until it is disabled or five pages were fetched
                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                if element.get_attribute('disabled'):
                    break
#                driver.implicitly_wait(30)
                ActionChains(driver).move_to_element(element).click(element).perform()
                process_web_request(db,driver,area_num,keyword)
                pagecnt+=1
                if pagecnt>=5:
                    break
#            table2.upsert({'kw':keyword,'num':job['num']},['kw'])
            table2.insert({'kw':keyword,'num':job['num']})  # stray ['kw'] keys argument dropped; it only applies to upsert()
            db.query('update areacodes set expand = 1 where num="'+str(job['num'])+'" and kw="'+keyword+'" ')
        except:
            traceback.print_exc()
            failcnt+=1
            if failcnt>=15:
                sys.exit()
            pass
if __name__ == '__main__':
    main()
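
# Example invocation (a sketch; the keyword and port values below are assumptions,
# not values taken from this repo). main() reads argv as <keyword> <selenium port>
# <selenium-wire proxy port>; when four or more arguments are given, the crawl loop
# additionally reuses argv[1]/argv[2] as repkw/repnum for the 'REP'/'SCAN' repeat modes.
#
#   python swire_docker_itemlist.py 火鍋 4444 8787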