swire_shop_item_list.py

# -*- coding: utf-8 -*-
# Crawl Google Maps search results for a keyword around stored lat/lon grid
# points and upsert the parsed stores into MySQL (swire_store_list),
# tracking progress in swire_progress_list.
#from selenium import webdriver
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import selenium
import traceback
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
import pandas as pd
import dataset
import time
import json
import re
import sys, os
import socket
import brotli

chrome_window = False
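
# Launch a selenium-wire Chrome session. With chrome_window=True a local
# browser is used; otherwise the script attaches to a remote Selenium node
# (docker container) on the given port and routes traffic through the
# selenium-wire proxy so response bodies can be inspected.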
def brower_start(port):
    options = webdriver.ChromeOptions()
    if chrome_window:
        browser = webdriver.Chrome(
            desired_capabilities=options.to_capabilities()
        )
    else:
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--proxy-server=host.docker.internal:8787')  # Specify your Kubernetes service-name here
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
            desired_capabilities=chrome_options.to_capabilities(),
            seleniumwire_options={'addr': '0.0.0.0', 'port': 8787, 'auto_config': False}
        )
        # seleniumwire_options = {'addr': '172.17.0.2', 'port': 4444})
    browser.set_window_size(1400, 1000)
    return browser
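
# Scroll the Google Maps result panel: read the result count from the header,
# then repeatedly click just to the right of the last list separator so the
# panel loads more entries.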
def page_down_(driver, xpath_css, time_):
    e = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
    # strip the trailing ' 項結果' ("results") suffix to get the numeric count
    result_count = e.text.split('-')[1].replace(' 項結果', '')
    print(result_count)
    if int(result_count) > 5:
        for i in range(time_):
            e = driver.find_elements_by_css_selector('div[class="TFQHme"]')
            action = webdriver.common.action_chains.ActionChains(driver)
            action.move_to_element_with_offset(e[-1], e[-1].size['width'] + 1, 0)
            action.click()
            action.perform()
            time.sleep(0.5)
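
# Collect (href, aria-label) pairs for every place link currently rendered
# in the result panel.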
def get_url_list(driver):
    page_down_(driver, '//div[@class="TFQHme"]', 8)
    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except:
            pass
    # print(len(url_list))
    return url_list
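
# Type the keyword into the Maps search box and press Enter.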
def keyin_keyword(driver, keyword):
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
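
# Pick a random unfinished keyword/area from swire_progress_list and attach
# the matching latitude/longitude/location from lat_lon_loc.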
def get_next_job(db):
    result = {}
    cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where num < 367 order by rand() limit 1')
    for c in cursor:
        result['kw'] = c['kw']
        result['num'] = c['num']
        break
    cursor = db.query('select lat,lon,loc from lat_lon_loc where num ="' + str(result['num']) + '"')
    for c in cursor:
        result['lat'] = c['lat']
        result['lon'] = c['lon']
        result['loc'] = c['loc']
        break
    return result
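
# Debug helper: dump an object to a UTF-8 text file.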
def write_to_file(jsobj, fname):
    import codecs
    fw = codecs.open(fname, 'w', 'utf-8')
    fw.write(str(jsobj))
    fw.close()
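
# Extract the embedded JSON blob from a Google Maps search response:
# unescape the payload, locate the span between [[" and ]]", parse it, and
# pull out name, fid, address and place_id for each result entry.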
def parsing_js(orig):
    resultobj = []
    content = ""
    lines = orig.split('\n')
    for l in lines:
        newl = l.replace('\\"', '"')
        # if '\\\\"' in newl:
        #     print(newl)
        #     newl = newl.repace('\\\\"', '')
        newl = newl.replace('\\"', '"')
        content += newl
    result = re.search(r'\[\["', content)
    print(result)
    content_begin = result.start()
    result = re.search(r'\]\]"', content)
    print(result)
    content_end = result.end()
    jscontent = content[content_begin:content_end - 1]
    # write_to_file(jscontent, 'c:/tmp/debug.txt')
    jsobj = json.loads(jscontent)
    for x in jsobj[0][1][1:]:
        print(x[14][11])
        print(x[14][10])
        print(x[14][2])
        print(x[14][78])
        try:
            resultobj.append({'name': x[14][11],
                              'fid': x[14][10],
                              'addr': x[14][2][0],
                              'place_id': x[14][78],
                              'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
        except:
            traceback.print_exc()
    return resultobj
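
# Upsert parsed store records into swire_store_list, keyed by place_id.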
def save_js_to_db(jsobj, num, keyword):
    global store_list_table
    for r in jsobj:
        r['num'] = num
        r['keyword'] = keyword
        store_list_table.upsert(r, keys=['place_id'])
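
# Wait for the result page to settle, then walk the requests captured by
# selenium-wire, decompress the brotli-encoded map search responses and
# store the parsed results.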
def process_web_request(driver, area_num, keyword):
    query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
    time.sleep(8)
    print('checking captured requests...')
    for request in driver.requests:
        if request.response:
            if 'https://www.google.com.tw/search?tbm=map' in request.url:
                print('parsing js:')
                resp = brotli.decompress(request.response.body)
                jstext = resp.decode('utf-8')
                resultobj = parsing_js(jstext)
                save_js_to_db(resultobj, area_num, keyword)
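
# Main loop: restart the Selenium docker container, then repeatedly pick a
# job, search Maps at the job's coordinates, page through the result list
# and record progress. The script exits after 15 failures.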
def main():
    global chrome_window
    global store_list_table
    failcnt = 0
    localip = socket.gethostbyname(socket.gethostname())
    if localip == '192.168.1.108':
        # chrome_window=True
        chrome_window = False
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    table2 = db['swire_progress_list']
    port = 4444
    # if len(sys.argv) > 1:
    #     port = int(sys.argv[1])
    if True:
        print('restart docker p{}'.format(port))
        # os.system('sudo docker container restart p'+str(port))
        os.system('docker container restart p' + str(port))
        time.sleep(10)
    print('driver start...')
    driver = brower_start(port)
    while True:
        try:
            job = get_next_job(db)
            print(job)
            keyword = job['kw']
            latitude = job['lat']    # latitude
            longitude = job['lon']   # longitude
            area_num = job['num']
            url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
            driver.get(url)
            keyin_keyword(driver, keyword)
            process_web_request(driver, area_num, keyword)
            # keep paging through the result list until the paging element is disabled
            while True:
                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                if element.get_attribute('disabled'):
                    break
                # driver.implicitly_wait(30)
                ActionChains(driver).move_to_element(element).click(element).perform()
                process_web_request(driver, area_num, keyword)
            table2.upsert({'kw': keyword, 'num': job['num']}, ['kw'])
        except:
            traceback.print_exc()
            failcnt += 1
            if failcnt >= 15:
                sys.exit()
            pass


if __name__ == '__main__':
    main()