shop_item_list.py

# -*- coding: utf-8 -*-
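"""Google Maps POI crawler.

High-level flow (inferred from the code below): pick a keyword, walk the
lat/lon grid stored in `lat_lon_loc`, search Google Maps at each point,
capture the internal search response via selenium-wire, and save every
result's place URL into `shop_item_list3`, tracking per-keyword progress
in `progress_list2`.
"""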
#from selenium import webdriver
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
import selenium
import traceback
from bs4 import BeautifulSoup

from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *

from datetime import datetime
import pandas as pd
import dataset
import time
import json
import re
import sys, os
import socket
import brotli

# True when driving a local Chrome window; False drives a remote Selenium
# node (flipped in main() based on the host IP).
chrome_window = False


def browser_start(port):
    options = webdriver.ChromeOptions()
    if chrome_window:
        browser = webdriver.Chrome(
            desired_capabilities=options.to_capabilities()
        )
    else:
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
            desired_capabilities=options.to_capabilities()
        )
    return browser

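# NOTE: passing `desired_capabilities` matches the Selenium 3 era API this
# script targets; Selenium 4 deprecates it in favor of `options=options`.
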
def page_down_(driver, xpath_css, time_):
    # Read the "1-20 項結果" (zh-TW for "1-20 results") label to see how many
    # results the pane is showing.
    e = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
    result_count = e.text.split('-')[1].replace(' 項結果', '')
    print(result_count)
    if int(result_count) > 5:
        for i in range(time_):
            # Click just past the right edge of the last result separator;
            # this appears to focus the results pane and trigger lazy loading.
            e = driver.find_elements_by_css_selector('div[class="TFQHme"]')
            action = webdriver.common.action_chains.ActionChains(driver)
            action.move_to_element_with_offset(e[-1], e[-1].size['width'] + 1, 0)
            action.click()
            action.perform()
            time.sleep(0.5)
    # An earlier approach: focus a result element and send PAGE_DOWN keys.
    # elmts = driver.find_elements_by_xpath(xpath_css)
    # print(elmts)
    # if len(elmts) > 1:
    #     elmt = elmts[1]
    # else:
    #     elmt = elmts[0]
    # actions = ActionChains(driver)
    # actions.move_to_element(elmt).click().perform()
    # for i in range(time_):
    #     try:
    #         actions = ActionChains(driver)
    #         actions.send_keys(Keys.PAGE_DOWN).perform()
    #     except:
    #         traceback.print_exc()
    #     time.sleep(0.5)


def get_url_list(driver):
    # for i in range(5, 43, 2):
    #     try:
    #         wait = WebDriverWait(driver, 60)
    #         wait.until(
    #             EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)))
    #         )
    #         driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
    #         time.sleep(0.5)
    #     except:
    #         pass
    # wait = WebDriverWait(driver, 30)
    # try:  # update results when the map moves
    #     wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ppdPk-Ej1Yeb-LgbsSe-tJiF1e"]')))
    # except selenium.common.exceptions.TimeoutException:
    #     traceback.print_exc()
    #     return "EMPTY"
    page_down_(driver, '//div[@class="TFQHme"]', 8)

    # Collect every place link ("maps/place") plus its accessible name.
    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except:
            pass
    # print(len(url_list))
    return url_list


def keyin_keyword(driver, keyword):
    # "searchbox" is the id of the Maps search input as observed when this
    # crawler was written; Google may change it.
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)


# def get_crawler_list(db):
#     result = db.query('select keyword, count(*) from shop_item_list group by keyword')
#     result = pd.DataFrame([i for i in result])
#     result.columns = ['keyword', 'count']
#     result = result[result['count'] < 100]
#     keyword = result.sample(1).iloc[0]['keyword']
#     num = 0
#     cursor = db.query('select num from progress_list2 where kw = "' + keyword + '"')
#     for c in cursor:
#         num = c['num']
#         break
#     cursor = db.query('select * from lat_lon_loc where num >= ' + str(num))
#     # cursor = db.query('select * from lat_lon_loc')
#     lst = []
#     for c in cursor:
#         lst.append({'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']})
#     return keyword, lst

def get_crawler_list(db):
    # result = db.query('select * from shop_item_list order by keyword')
    # result = pd.DataFrame([i for i in result])
    # result = result[~result.keyword.str.contains('項')]
    # progress = db.query('select distinct(kw) from progress_list2 where num < 367')
    # progress = pd.DataFrame([i for i in progress])
    # if len(progress) != 0:
    #     keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0]['keyword']
    # else:
    #     keyword = result.iloc[0]['keyword']
    #
    # return keyword
    # Hard-coded test keyword ('滷味' = braised snacks); it short-circuits the
    # DB lookup below, which stays unreachable until this line is removed.
    return '滷味'
    cursor = db.query('select distinct(kw) from progress_list2 where num < 367 order by num asc limit 1')
    for c in cursor:
        return c['kw']
    return None

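# Resume mechanics (inferred from the tables used here): `progress_list2`
# keeps one row per keyword (`kw`, `num`), where `num` is the index of the
# last grid point visited in `lat_lon_loc`; get_lon_lat_list() below
# continues the walk from that point.
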
def get_lon_lat_list(db, keyword):
    num = 0
    cursor = db.query('select num from progress_list2 where kw = "' + keyword + '"')
    for c in cursor:
        num = c['num']
        break
    cursor = db.query('select * from lat_lon_loc where num >= ' + str(num))
    lst = []
    for c in cursor:
        lst.append({'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']})
    return lst

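# The string-built SQL above is injection-prone. A safer sketch, assuming
# dataset forwards bound parameters through to SQLAlchemy for text clauses:
#
#   from sqlalchemy import text
#   cursor = db.query(text('select num from progress_list2 where kw = :kw'), kw=keyword)
#   cursor = db.query(text('select * from lat_lon_loc where num >= :num'), num=num)
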
def parsing_js(orig):
    # Unescape the JS payload, then cut out the first JSON array literal
    # (from '[["' to ']]"') and parse it.
    content = ""
    lines = orig.split('\n')
    for l in lines:
        newl = l.replace('\\"', '"')
        content += newl
    result = re.search(r'\[\["', content)
    content_begin = result.start()
    result = re.search(r'\]\]"', content)
    content_end = result.end()
    jscontent = content[content_begin:content_end - 1]
    jsobj = json.loads(jscontent)
    print()
    # The magic indices below were reverse-engineered from Google Maps'
    # internal search response; their meaning is not guaranteed to be stable.
    for x in jsobj[0][1][1:]:
        print(x[14][11])
        print(x[14][10])
        print(x[14][2])
        print(x[14][78])

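# A minimal offline check for parsing_js(), assuming a response body was
# previously saved to disk (the path is hypothetical; the commented-out code
# in main() below shows how such a dump was written):
#
#   import codecs
#   with codecs.open('/tmp/ot.json', 'r', 'utf-8') as fr:
#       parsing_js(fr.read())
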
def main():
    global chrome_window
    localip = socket.gethostbyname(socket.gethostname())
    if localip == '192.168.1.108':
        chrome_window = True
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list3']
    table2 = db['progress_list2']

    port = 4447
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
    print('restart docker p{}'.format(port))
    os.system('sudo docker container restart p' + str(port))
    time.sleep(8)

    print('driver start...')
    driver = browser_start(port)

    for i in range(10):
        try:
            keyword = get_crawler_list(db)
            print(keyword)
            lst = get_lon_lat_list(db, keyword)
            # print(lst)
            print(keyword, len(lst))
            for r in lst:
                latitude = r['lat']   # latitude
                longitude = r['lon']  # longitude
                area_num = r['num']
                table2.upsert({'kw': keyword, 'num': r['num']}, ['kw'])

                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
                driver.get(url)
                keyin_keyword(driver, keyword)
                failcnt = 0
                time.sleep(10)

                # Inspect the requests selenium-wire captured and decode the
                # Brotli-compressed Maps search response.
                for request in driver.requests:
                    if request.response:
                        if 'https://www.google.com.tw/search?tbm=map' in request.url:
                            print(
                                request.url,
                                request.response.status_code,
                                request.response.headers['Content-Type']
                            )
                            print('parsing js:')
                            resp = brotli.decompress(request.response.body)
                            jstext = resp.decode('utf-8')
                            parsing_js(jstext)
                            # import codecs
                            # fw = codecs.open('c:/tmp/ot.json', 'w', 'utf-8')
                            # fw.write(jstext)
                            # fw.close()
                            # print(jstext)
                            # time.sleep(9999)
                            # jsobj = json.loads(jstext)
                            # print(jsobj)
                            # sys.exit()

                for page in range(10):
                    print(keyword, latitude, longitude, page)
                    url_list = get_url_list(driver)
                    duplicate = 0
                    # shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
                    for item in url_list:
                        try:
                            table.insert({'name': item[1], 'lon': longitude, 'lat': latitude,
                                          'keyword': keyword, 'item_url': item[0], 'area_num': area_num,
                                          'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
                        except:
                            duplicate += 1
                    print(len(url_list), duplicate)

                    # result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
                    # insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
                    #     .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'', ''), tuple(result))
                    # DA.mysql_insert_data(db, insert_sql)

                    if page < 2:
                        # 'ppdPk-Ej1Yeb-LgbsSe-tJiF1e' is the obfuscated id of
                        # the "next page" button; it is disabled on the last page.
                        element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                        if element.get_attribute('disabled'):
                            break
                        driver.implicitly_wait(30)
                        ActionChains(driver).move_to_element(element).click(element).perform()
        except:
            # Keep crawling on failure, but log the error instead of
            # swallowing it silently.
            traceback.print_exc()

if __name__ == '__main__':
    main()