jared_pureselenium_shop_item_list.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. # -*- coding: utf-8 -*-
  2. from selenium import webdriver
  3. from selenium.webdriver.common.action_chains import ActionChains
  4. from selenium.webdriver.common.keys import Keys
  5. from selenium.webdriver.support import expected_conditions as EC
  6. from selenium.webdriver.support.wait import WebDriverWait
  7. from selenium.webdriver.common.by import By
  8. from bs4 import BeautifulSoup
  9. from utility import database_access as DA
  10. from utility.parseutils import *
  11. from utility.connect import *
  12. import dataset
  13. import sys
  14. from datetime import datetime
  15. import pandas as pd
  16. import time
  17. import traceback
  18. import json
  19. import re
  20. import os
  21. import selenium
  22. def brower_start(port):
  23. options = webdriver.ChromeOptions()
  24. # browser = webdriver.Chrome(options=options)
  25. # 上面成功再來用docker
  26. browser = webdriver.Remote(
  27. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  28. desired_capabilities=options.to_capabilities()
  29. )
  30. return browser
  31. def get_url_list(driver):
  32. wait = WebDriverWait(driver, 30)
  33. try:
  34. wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ppdPk-Ej1Yeb-LgbsSe-tJiF1e"]')))
  35. except selenium.common.exceptions.TimeoutException:
  36. traceback.print_exc()
  37. return "EMPTY"
  38. # elmts=driver.find_elements_by_xpath("//div[contains(@class,'siAUzd-neVct section-scrollbox') and not( contains(@role,'region') )]")
  39. elmts=driver.find_elements_by_xpath("//div[@class='siAUzd-neVct section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc siAUzd-neVct-Q3DXx-BvBYQ']")
  40. print(elmts)
  41. if len(elmts)>1:
  42. elmt=elmts[1]
  43. else:
  44. elmt=elmts[0]
  45. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  46. for i in range(8):
  47. try:
  48. # print(elmt)
  49. # print('before send key')
  50. elmt.send_keys(Keys.PAGE_DOWN)
  51. except:
  52. # print('exception')
  53. traceback.print_exc()
  54. # print('after send key')
  55. time.sleep(0.5)
  56. url_soup = BeautifulSoup(driver.page_source, 'html.parser')
  57. url_list = []
  58. for i in url_soup.find_all('a'):
  59. try:
  60. if i['href'].find('maps/place') != -1:
  61. url_list += [[i['href'], i['aria-label']]]
  62. except:
  63. pass
  64. return url_list
  65. def keyin_keyword(driver, keyword):
  66. button = driver.find_element_by_id("searchbox")
  67. driver.implicitly_wait(30)
  68. ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
  69. time.sleep(3)
  70. def main():
  71. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
  72. table=db['shop_item_list']
  73. keyword = '虱目魚'
  74. if len(sys.argv) >1:
  75. keyword=sys.argv[1]
  76. port=4444
  77. if len(sys.argv) >2:
  78. port=int(sys.argv[2])
  79. os.system('docker container restart p'+str(port))
  80. time.sleep(8)
  81. print('drvier start...')
  82. driver = brower_start(port)
  83. num=0
  84. cursor=db.query('select num from progress_list where kw = "'+keyword+'"')
  85. for c in cursor:
  86. num=c['num']
  87. break
  88. table2=db['progress_list']
  89. cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
  90. # cursor=db.query('select * from lat_lon_loc')
  91. lst=[]
  92. for c in cursor:
  93. lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
  94. for r in lst:
  95. latitude = r['lat'] #緯度
  96. longitude = r['lon'] #精度
  97. table2.upsert({'kw':keyword,'num':r['num']},['kw'])
  98. url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
  99. driver.get(url)
  100. keyin_keyword(driver, keyword)
  101. failcnt=0
  102. for page in range(4):
  103. print( r['loc'], latitude, longitude, page)
  104. url_list = get_url_list(driver)
  105. if url_list == 'EMPTY':
  106. failcnt+=1
  107. if failcnt >=2:
  108. break
  109. continue
  110. print(url_list)
  111. shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
  112. for item in url_list:
  113. try:
  114. table.insert({'name':item[1],'lon':longitude, 'lat':latitude, 'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
  115. except:
  116. print('dup entry')
  117. # result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
  118. # print(result)
  119. if page < 2 :
  120. element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
  121. driver.implicitly_wait(30)
  122. ActionChains(driver).move_to_element(element).click(element).perform()
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()