swire_shop_item_list.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399
  1. # -*- coding: utf-8 -*-
  2. #from selenium import webdriver
  3. from seleniumwire import webdriver
  4. from selenium.webdriver.common.action_chains import ActionChains
  5. from selenium.webdriver.common.keys import Keys
  6. from selenium.webdriver.support import expected_conditions as EC
  7. from selenium.webdriver.support.wait import WebDriverWait
  8. from selenium.webdriver.common.by import By
  9. import selenium
  10. import traceback
  11. from bs4 import BeautifulSoup
  12. from utility import database_access as DA
  13. from utility.parseutils import *
  14. from utility.connect import *
  15. from datetime import datetime
  16. import pandas as pd
  17. import dataset
  18. import time
  19. import json
  20. import re
  21. import sys, os
  22. import socket
  23. import brotli
  24. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  25. import urllib.parse
  26. chrome_window=False
  27. globalkw=None
  28. proxyport=8787
  29. def build_cache(db):
  30. id_dict={}
  31. cursor = db.query('SELECT place_id FROM google_poi.swire_store_list;')
  32. for c in cursor:
  33. id_dict[c['place_id']]=1
  34. return id_dict
  35. #
  36. def brower_start(port):
  37. global proxyport
  38. global chrome_window
  39. print(proxyport)
  40. options = webdriver.ChromeOptions()
  41. if chrome_window:
  42. browser = webdriver.Chrome(
  43. desired_capabilities=options.to_capabilities()
  44. )
  45. else:
  46. chrome_options = webdriver.ChromeOptions()
  47. chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport)) # Specify your Kubernetes service-name here
  48. chrome_options.add_argument('--ignore-certificate-errors')
  49. chrome_options.add_argument("--no-sandbox")
  50. chrome_options.add_argument("--disable-dev-shm-usage")
  51. browser = webdriver.Remote(
  52. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  53. desired_capabilities=chrome_options.to_capabilities(),
  54. seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
  55. )
  56. # seleniumwire_options = {'addr': '172.17.0.2','port':4444})
  57. browser.set_window_size(1400,1000)
  58. return browser
  59. def page_down_(driver, xpath_css, time_):
  60. e = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
  61. result_count = e.text.split('-')[1].replace(' 項結果','')
  62. print(result_count)
  63. if int(result_count) > 5:
  64. for i in range(time_):
  65. e = driver.find_elements_by_css_selector('div[class="TFQHme"]')
  66. action = webdriver.common.action_chains.ActionChains(driver)
  67. action.move_to_element_with_offset(e[-1], e[-1].size['width'] + 1 , 0)
  68. action.click()
  69. action.perform()
  70. time.sleep(0.5)
  71. def get_url_list(driver):
  72. page_down_(driver, '//div[@class="TFQHme"]', 8)
  73. url_soup = BeautifulSoup(driver.page_source, 'html.parser')
  74. url_list = []
  75. for i in url_soup.find_all('a'):
  76. try:
  77. if i['href'].find('maps/place') != -1:
  78. url_list += [[i['href'], i['aria-label']]]
  79. except:
  80. pass
  81. # print(len(url_list))
  82. return url_list
  83. def keyin_keyword(driver, keyword):
  84. button = driver.find_element_by_id("searchbox")
  85. driver.implicitly_wait(30)
  86. ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
  87. time.sleep(3)
  88. def get_next_job(db,repeat=False,repkw=None,repnum=None):
  89. global globalkw
  90. result={}
  91. # if globalkw is not None:
  92. # cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where kw="'+globalkw+'"')
  93. # else:
  94. # cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where num < 367 order by rand() limit 1')
  95. cursor = db.query('select kw,num from areacodes where expand=0 order by rand()')
  96. for c in cursor:
  97. result['kw']=c['kw']
  98. result['num']=c['num']
  99. break
  100. if result.get('num') is not None:
  101. cursor = db.query('select lat,lon,loc from lat_lon_loc where num ="'+str(result['num'])+'"')
  102. for c in cursor:
  103. result['lat']=c['lat']
  104. result['lon']=c['lon']
  105. result['loc']=c['loc']
  106. break
  107. if repeat and repkw!= 'REP':
  108. result['kw']=repkw
  109. result['num']=repnum
  110. if 'REP' in repkw:
  111. repnum=None
  112. cursor = db.query('select num from swire_store_list where num not in (select num from conv_log) order by rand() limit 1')
  113. for c in cursor:
  114. repnum=c['num']
  115. break
  116. if repnum is None:
  117. cursor = db.query('select num from swire_store_list order by rand() limit 1')
  118. for c in cursor:
  119. repnum=c['num']
  120. break
  121. # cursor = db.query('select lat_txt,lon_txt,keyword,num from swire_store_list order by rand() limit 1')
  122. cursor = db.query('select lat_txt,lon_txt,keyword,num from swire_store_list where num="'+str(repnum)+'" limit 1')
  123. for c in cursor:
  124. result['kw']=c['keyword']
  125. result['num']=c['num']
  126. result['lat']=c['lat_txt']
  127. result['lon']=c['lon_txt']
  128. result['loc']=''
  129. return result
  130. # if repeat:
  131. # cursor = db.query('select lat_txt,lon_txt from swire_store_list where num ="'+str(result['num'])+'" and keyword="'+result['kw']+'" order by rand() limit 1')
  132. # for c in cursor:
  133. # result['lat']=c['lat_txt']
  134. # result['lon']=c['lon_txt']
  135. return result
  136. def write_to_file(jsobj,fname):
  137. import codecs
  138. fw=codecs.open(fname,'w','utf-8')
  139. fw.write(str(jsobj))
  140. fw.close()
  141. def parsing_js(orig):
  142. resultobj=[]
  143. content=""
  144. lines=orig.split('\n')
  145. for l in lines:
  146. newl=l.replace('\\"','"')
  147. # if '\\\\"' in newl:
  148. # print(newl)
  149. # newl=newl.repace('\\\\"','')
  150. newl=newl.replace('\\"','"')
  151. content+=newl
  152. result=re.search(r'\[\["',content)
  153. print(result)
  154. content_begin=result.start()
  155. result=re.search(r'\]\]"',content)
  156. print(result)
  157. content_end=result.end()
  158. jscontent=content[content_begin:content_end-1]
  159. # write_to_file(jscontent,'c:/tmp/debug.txt')
  160. jsobj=json.loads(jscontent)
  161. for x in jsobj[0][1][1:]:
  162. print(x[14][11])
  163. print(x[14][9])
  164. reviews_cnt=None
  165. photo=None
  166. rating=None
  167. biz_id=None
  168. loc_x=None
  169. loc_y=None
  170. addr_elmts=None
  171. tel=None
  172. try:
  173. rating=x[14][4][7]
  174. reviews_cnt=x[14][4][8]
  175. except:
  176. traceback.print_exc()
  177. try:
  178. photo=x[14][37][0][0][0]
  179. num_photos=x[14][37][0][0][6][1]
  180. except:
  181. traceback.print_exc()
  182. try:
  183. loc_x=x[14][37][0][0][29][0]
  184. loc_y=x[14][37][0][0][29][1]
  185. except:
  186. traceback.print_exc()
  187. try:
  188. biz_id=x[14][57][2]
  189. tel=x[14][178][0][3]
  190. except:
  191. traceback.print_exc()
  192. try:
  193. addr_elmts=str(x[14][82])
  194. except:
  195. traceback.print_exc()
  196. category=str(x[14][13])
  197. topic=str(x[14][89])
  198. print(x[14][13])
  199. print(x[14][10])
  200. print(x[14][2])
  201. print(x[14][78])
  202. try:
  203. resultobj.append({'name':x[14][11],'fid':x[14][10],'addr':x[14][2][0],'addr_elmts':addr_elmts,'place_id':x[14][78],'category':category,'rating':rating,'reviews_cnt':reviews_cnt,'lat':x[14][9][2],'lat_txt':str(x[14][9][2]),'lon':x[14][9][3],'lon_txt':str(x[14][9][3]),'topic':topic,'photo':photo,'num_photos':num_photos,'loc_x':loc_x,'loc_y':loc_y,'biz_id':biz_id,'tel':tel,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
  204. except:
  205. traceback.print_exc()
  206. return resultobj
  207. def save_js_to_db(jsobj,num,keyword):
  208. global store_list_table
  209. global iddict
  210. for r in jsobj:
  211. if iddict.get(r['place_id']) is not None:
  212. continue
  213. r['num']=num
  214. r['keyword']=keyword
  215. try:
  216. store_list_table.insert(r)
  217. # store_list_table.upsert(r,keys=['place_id'])
  218. except:
  219. traceback.print_exc()
  220. # store_list_table.upsert(r,keys=['place_id'])
  221. def process_web_request(driver,area_num,keyword):
  222. # query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
  223. time.sleep(0.8)
  224. time.sleep(3)
  225. print("ppppppppp&**********************")
  226. for request in driver.requests:
  227. if 'search?' in request.url :
  228. print('searching.....')
  229. # else:
  230. # print(request.url[20:60])
  231. if request.response:
  232. # if 'https://www.google.com.tw/search?tbm=map' in request.url :
  233. if 'search?' in request.url :
  234. print('parsing js:')
  235. resp = brotli.decompress(request.response.body)
  236. jstext=resp.decode('utf-8')
  237. resultobj=parsing_js(jstext)
  238. print("before",datetime.now())
  239. save_js_to_db(resultobj,area_num,keyword)
  240. print("after",datetime.now())
  241. # time.sleep(9999)
  242. def main():
  243. global chrome_window
  244. global store_list_table
  245. global globalkw
  246. global proxyport
  247. global iddict
  248. if len(sys.argv)>1:
  249. globalkw=sys.argv[1]
  250. failcnt=0
  251. localip=socket.gethostbyname(socket.gethostname())
  252. # if localip=='192.168.1.108':
  253. # chrome_window=True
  254. # chrome_window=False
  255. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
  256. iddict=build_cache(db)
  257. store_list_table = db['swire_store_list']
  258. # table2 = db['swire_progress_list']
  259. table2 = db['swire_area_progress']
  260. port=4444
  261. if len(sys.argv) == 3 :
  262. port=int(sys.argv[1])
  263. proxyport=int(sys.argv[2])
  264. if not chrome_window:
  265. print('restart docker p{}'.format(port))
  266. # os.system('sudo docker container restart p'+str(port))
  267. os.system('docker container restart p'+str(port))
  268. time.sleep(10)
  269. print('drvier start...')
  270. driver = brower_start(port)
  271. area_num=None
  272. while True:
  273. try:
  274. if len(sys.argv) > 3 :
  275. repkw=sys.argv[1]
  276. repnum=sys.argv[2]
  277. job=get_next_job(db,repeat=True,repkw=repkw,repnum=repnum)
  278. else:
  279. job=get_next_job(db)
  280. print(job)
  281. keyword = job['kw']
  282. latitude = job['lat'] #緯度
  283. longitude = job['lon'] #精度
  284. area_num=job['num']
  285. safe_string = urllib.parse.quote_plus(keyword)
  286. url = 'https://www.google.com.tw/maps/@{},{},18z?hl=zh-TW'.format(latitude, longitude)
  287. prev_cnt=0
  288. cursor = db.query('select count(*) as cnt from swire_store_list where num="'+str(area_num)+'" ')
  289. for c in cursor:
  290. prev_cnt=c['cnt']
  291. break
  292. # url = 'https://www.google.com/maps/search/'+safe_string+'/@{},{},16z/data=!3m1!4b1'.format(latitude, longitude)
  293. # url='https://www.google.com/maps/search/'+safe_string+'/@24.7962279,121.0449762,15z/data=!3m1!4b1?hl=zh-TW'
  294. # print(url)
  295. # url='https://www.google.com/maps/search/%E7%81%AB%E9%8D%8B%E9%A4%90%E5%BB%B3/@24.772608,121.0515456,13z'
  296. driver.get(url)
  297. # time.sleep(3)
  298. keyin_keyword(driver, keyword)
  299. process_web_request(driver,area_num,keyword)
  300. while True:
  301. element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
  302. if element.get_attribute('disabled'):
  303. break
  304. # driver.implicitly_wait(30)
  305. ActionChains(driver).move_to_element(element).click(element).perform()
  306. process_web_request(driver,area_num,keyword)
  307. # table2.upsert({'kw':keyword,'num':job['num']},['kw'])
  308. table2.insert({'kw':keyword,'num':job['num']},['kw'])
  309. db.query('update areacodes set expand = 1 where num="'+str(job['num'])+'" and kw="'+keyword+'" ')
  310. except:
  311. traceback.print_exc()
  312. failcnt+=1
  313. if failcnt>=15:
  314. sys.exit()
  315. pass
  316. aft_cnt=0
  317. cursor = db.query('select count(*) as cnt from swire_store_list where num="'+str(area_num)+'" ')
  318. for c in cursor:
  319. aft_cnt=c['cnt']
  320. break
  321. db['conv_log'].insert({'num':area_num,'prev':prev_cnt,'next':aft_cnt,'dt':datetime.now()})
  322. if __name__ == '__main__':
  323. main()