# swire_shop_item_list.py — Google Maps shop-item crawler (selenium-wire)
  1. # -*- coding: utf-8 -*-
  2. #from selenium import webdriver
  3. from seleniumwire import webdriver
  4. from selenium.webdriver.common.action_chains import ActionChains
  5. from selenium.webdriver.common.keys import Keys
  6. from selenium.webdriver.support import expected_conditions as EC
  7. from selenium.webdriver.support.wait import WebDriverWait
  8. from selenium.webdriver.common.by import By
  9. import selenium
  10. import traceback
  11. from bs4 import BeautifulSoup
  12. from utility import database_access as DA
  13. from utility.parseutils import *
  14. from utility.connect import *
  15. from datetime import datetime
  16. import pandas as pd
  17. import dataset
  18. import time
  19. import json
  20. import re
  21. import sys, os
  22. import socket
  23. import brotli
  24. chrome_window=False
  25. def brower_start(port):
  26. options = webdriver.ChromeOptions()
  27. if chrome_window:
  28. browser = webdriver.Chrome(
  29. desired_capabilities=options.to_capabilities()
  30. )
  31. else:
  32. browser = webdriver.Remote(
  33. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  34. desired_capabilities=options.to_capabilities()
  35. )
  36. return browser
  37. def page_down_(driver, xpath_css, time_):
  38. e = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
  39. result_count = e.text.split('-')[1].replace(' 項結果','')
  40. print(result_count)
  41. if int(result_count) > 5:
  42. for i in range(time_):
  43. e = driver.find_elements_by_css_selector('div[class="TFQHme"]')
  44. action = webdriver.common.action_chains.ActionChains(driver)
  45. action.move_to_element_with_offset(e[-1], e[-1].size['width'] + 1 , 0)
  46. action.click()
  47. action.perform()
  48. time.sleep(0.5)
  49. def get_url_list(driver):
  50. page_down_(driver, '//div[@class="TFQHme"]', 8)
  51. url_soup = BeautifulSoup(driver.page_source, 'html.parser')
  52. url_list = []
  53. for i in url_soup.find_all('a'):
  54. try:
  55. if i['href'].find('maps/place') != -1:
  56. url_list += [[i['href'], i['aria-label']]]
  57. except:
  58. pass
  59. # print(len(url_list))
  60. return url_list
  61. def keyin_keyword(driver, keyword):
  62. button = driver.find_element_by_id("searchbox")
  63. driver.implicitly_wait(30)
  64. ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
  65. time.sleep(3)
  66. def get_crawler_list(db):
  67. # result = db.query('select * from shop_item_list order by keyword')
  68. # result = pd.DataFrame([i for i in result])
  69. # result = result[~result.keyword.str.contains('項')]
  70. # progress = db.query('select distinct(kw) from progress_list2 where num < 367')
  71. # progress = pd.DataFrame([i for i in progress])
  72. # if len(progress) != 0:
  73. # keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0]['keyword']
  74. # else:
  75. # keyword = result.iloc[0]['keyword']
  76. #
  77. # return keyword
  78. return '滷味'
  79. cursor = db.query('select distinct(kw) from progress_list2 where num < 367 order by num asc limit 1')
  80. for c in cursor:
  81. return c['kw']
  82. return None
  83. def get_lon_lat_list(db, keyword):
  84. num=0
  85. cursor=db.query('select num from progress_list2 where kw = "'+keyword+'"')
  86. for c in cursor:
  87. num=c['num']
  88. break
  89. cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
  90. lst=[]
  91. for c in cursor:
  92. lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
  93. return lst
  94. def write_to_file(jsobj,fname):
  95. import codecs
  96. fw=codecs.open(fname,'w','utf-8')
  97. fw.write(str(jsobj))
  98. fw.close()
def parsing_js(orig):
    """Extract the embedded JSON payload from a Maps search response body,
    dump it to a debug file, and print selected fields.

    NOTE(review): side effects only (prints + debug file); returns None.
    """
    content = ""
    lines = orig.split('\n')
    for l in lines:
        # Un-escape the response: collapse \" to plain quotes. Running the
        # replace twice is deliberate — doubled escapes (\\") reduce to \"
        # on the first pass and to " on the second; do not "deduplicate".
        newl = l.replace('\\"', '"')
        # if '\\\\"' in newl:
        #     print(newl)
        #     newl=newl.repace('\\\\"','')
        newl = newl.replace('\\"', '"')
        content += newl
    # The JSON array of interest starts at the first '[["' and its closing
    # ']]' is followed by the quote that ended the embedded JS string.
    result = re.search(r'\[\["', content)
    print(result)
    content_begin = result.start()
    result = re.search(r'\]\]"', content)
    print(result)
    content_end = result.end()
    # end-1 drops that trailing quote so the slice is valid JSON.
    jscontent = content[content_begin:content_end - 1]
    # NOTE(review): hard-coded Windows debug path — breaks off this machine.
    write_to_file(jscontent, 'c:/tmp/debug.txt')
    jsobj = json.loads(jscontent)
    # Indices into Google's undocumented result arrays; presumably
    # name/address/category-style fields — verify against live responses.
    for x in jsobj[0][1][1:]:
        print(x[14][11])
        print(x[14][10])
        print(x[14][2])
        print(x[14][78])
  123. def main():
  124. global chrome_window
  125. localip=socket.gethostbyname(socket.gethostname())
  126. if localip=='192.168.1.108':
  127. chrome_window=True
  128. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
  129. table = db['shop_item_list3']
  130. table2 = db['progress_list2']
  131. port=4447
  132. if len(sys.argv) > 1 :
  133. port=int(sys.argv[1])
  134. print('restart docker p{}'.format(port))
  135. os.system('sudo docker container restart p'+str(port))
  136. time.sleep(8)
  137. print('drvier start...')
  138. driver = brower_start(port)
  139. for i in range(10):
  140. try:
  141. keyword = get_crawler_list(db)
  142. print(keyword)
  143. lst = get_lon_lat_list(db, keyword)
  144. # print(lst)
  145. print(keyword, len(lst))
  146. for r in lst:
  147. latitude = r['lat'] #緯度
  148. longitude = r['lon'] #精度
  149. area_num=r['num']
  150. table2.upsert({'kw':keyword,'num':r['num']},['kw'])
  151. url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
  152. driver.get(url)
  153. keyin_keyword(driver, keyword)
  154. failcnt = 0
  155. # query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
  156. time.sleep(11)
  157. print("ppppppppp&**********************")
  158. for request in driver.requests:
  159. if request.response:
  160. if 'https://www.google.com.tw/search?tbm=map' in request.url :
  161. print('parsing js:')
  162. resp = brotli.decompress(request.response.body)
  163. jstext=resp.decode('utf-8')
  164. parsing_js(jstext)
  165. for page in range(10):
  166. if page < 2 :
  167. element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
  168. if element.get_attribute('disabled'):
  169. break
  170. driver.implicitly_wait(30)
  171. ActionChains(driver).move_to_element(element).click(element).perform()
  172. except:
  173. pass
  174. if __name__ == '__main__':
  175. main()