# jared_pureselenium_shop_item_list.py
  1. # -*- coding: utf-8 -*-
  2. from selenium import webdriver
  3. from selenium.webdriver.common.action_chains import ActionChains
  4. from selenium.webdriver.common.keys import Keys
  5. from selenium.webdriver.support import expected_conditions as EC
  6. from selenium.webdriver.support.wait import WebDriverWait
  7. from selenium.webdriver.common.by import By
  8. from bs4 import BeautifulSoup
  9. from utility import database_access as DA
  10. from utility.parseutils import *
  11. from utility.connect import *
  12. import dataset
  13. import sys
  14. from datetime import datetime
  15. import pandas as pd
  16. import time
  17. import traceback
  18. import json
  19. import re
  20. def brower_start(port):
  21. options = webdriver.ChromeOptions()
  22. browser = webdriver.Chrome(options=options)
  23. # 上面成功再來用docker
  24. # browser = webdriver.Remote(
  25. # command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  26. # desired_capabilities=options.to_capabilities()
  27. # )
  28. return browser
  29. def get_url_list(driver):
  30. wait = WebDriverWait(driver, 60)
  31. wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ppdPk-Ej1Yeb-LgbsSe-tJiF1e"]')))
  32. # elmts=driver.find_elements_by_xpath("//div[contains(@class,'siAUzd-neVct section-scrollbox') and not( contains(@role,'region') )]")
  33. elmts=driver.find_elements_by_xpath("//div[@class='siAUzd-neVct section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc siAUzd-neVct-Q3DXx-BvBYQ']")
  34. print(elmts)
  35. if len(elmts)>1:
  36. elmt=elmts[1]
  37. else:
  38. elmt=elmts[0]
  39. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  40. for i in range(8):
  41. try:
  42. # print(elmt)
  43. # print('before send key')
  44. elmt.send_keys(Keys.PAGE_DOWN)
  45. except:
  46. # print('exception')
  47. traceback.print_exc()
  48. # print('after send key')
  49. time.sleep(0.5)
  50. url_soup = BeautifulSoup(driver.page_source, 'html.parser')
  51. url_list = []
  52. for i in url_soup.find_all('a'):
  53. try:
  54. if i['href'].find('maps/place') != -1:
  55. url_list += [[i['href'], i['aria-label']]]
  56. except:
  57. pass
  58. return url_list
  59. def keyin_keyword(driver, keyword):
  60. button = driver.find_element_by_id("searchbox")
  61. driver.implicitly_wait(30)
  62. ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
  63. time.sleep(3)
  64. def main():
  65. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
  66. keyword = '虱目魚'
  67. if len(sys.argv) >1:
  68. keyword=sys.argv[1]
  69. port=4444
  70. if len(sys.argv) >2:
  71. port=int(sys.argv[2])
  72. print('drvier start...')
  73. driver = brower_start(port)
  74. num=0
  75. cursor=db.query('select num from progress_list where kw = "'+keyword+'"')
  76. for c in cursor:
  77. num=c['num']
  78. break
  79. table2=db['progress_list']
  80. cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
  81. # cursor=db.query('select * from lat_lon_loc')
  82. lst=[]
  83. for c in cursor:
  84. lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
  85. for r in lst:
  86. latitude = r['lat'] #緯度
  87. longitude = r['lon'] #精度
  88. table2.upsert({'kw':keyword,'num':r['num']},['kw'])
  89. url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
  90. driver.get(url)
  91. keyin_keyword(driver, keyword)
  92. for page in range(4):
  93. print( r['loc'], latitude, longitude, page)
  94. url_list = get_url_list(driver)
  95. print(url_list)
  96. shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
  97. for item in url_list:
  98. result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
  99. print(result)
  100. if page < 2 :
  101. element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
  102. driver.implicitly_wait(30)
  103. ActionChains(driver).move_to_element(element).click(element).perform()
  104. if __name__ == '__main__':
  105. main()