jared_shop_item_list.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. # -*- coding: utf-8 -*-
  2. from selenium import webdriver
  3. from selenium.webdriver.common.action_chains import ActionChains
  4. from selenium.webdriver.common.keys import Keys
  5. from selenium.webdriver.support import expected_conditions as EC
  6. from selenium.webdriver.support.wait import WebDriverWait
  7. from selenium.webdriver.common.by import By
  8. from bs4 import BeautifulSoup
  9. from utility import database_access as DA
  10. from utility.parseutils import *
  11. from utility.connect import *
  12. import dataset
  13. from datetime import datetime
  14. import pandas as pd
  15. import time
  16. import json
  17. import re
  def brower_start():
      """Start a local Chrome session with default options.

      Returns:
          The ``webdriver.Chrome`` instance that was created.
      """
      chrome_options = webdriver.ChromeOptions()
      # Alternative kept for reference: run against a remote Selenium hub
      # instead of a local Chrome.
      #   webdriver.Remote(
      #       command_executor='http://192.53.174.202:4444/wd/hub',
      #       desired_capabilities=chrome_options.to_capabilities(),
      #   )
      return webdriver.Chrome(options=chrome_options)
  def get_url_list(driver):
      """Nudge the result pane downwards, then collect place links from the page.

      For every second index from 5 to 41 the matching result anchor under
      ``#pane`` is awaited (up to 60 s) and sent a DOWN key, followed by a
      one-second pause. Presumably this makes the pane load more results;
      confirm against the live page. A failure on one anchor is ignored so
      that the remaining anchors are still tried.

      Args:
          driver: Selenium WebDriver currently showing a search result page.

      Returns:
          A list of ``[href, aria_label]`` pairs, one for every ``<a>`` in the
          page source whose ``href`` contains ``maps/place`` and which also
          has an ``aria-label`` attribute.
      """
      anchor_xpath = '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'
      # The wait object keeps no state between until() calls, so build it once.
      wait = WebDriverWait(driver, 60)

      for index in range(5, 43, 2):
          locator = (By.XPATH, anchor_xpath.format(index))
          try:
              # until() returns the clickable element, so no second lookup.
              anchor = wait.until(EC.element_to_be_clickable(locator))
              anchor.send_keys(Keys.DOWN)
              time.sleep(1)
          except Exception:
              # Best effort: a missing or stale anchor must not abort the
              # scroll. Unlike a bare ``except:``, this lets
              # KeyboardInterrupt and SystemExit through, so the crawl can
              # still be stopped during a long wait.
              continue

      url_soup = BeautifulSoup(driver.page_source, 'html.parser')
      url_list = []
      for link in url_soup.find_all('a'):
          href = link.get('href')
          if href is None or 'maps/place' not in href:
              continue
          label = link.get('aria-label')
          if label is None:
              # Links without an aria-label were skipped before as well
              # (the lookup raised and was swallowed).
              continue
          url_list.append([href, label])
      return url_list
  def keyin_keyword(driver, keyword):
      """Type *keyword* into the ``searchbox`` element and submit it.

      Args:
          driver: Selenium WebDriver showing a page that contains an element
              with id ``searchbox``.
          keyword: Text to type before pressing RETURN.

      Side effects:
          Sets the driver's implicit wait to 30 seconds and sleeps for
          3 seconds after submitting.
      """
      # Set the implicit wait *before* the lookup so it actually applies to
      # finding the search box.
      driver.implicitly_wait(30)
      # ``find_element_by_id`` was removed in Selenium 4.3; use the By-based
      # locator API, which the rest of this file already uses.
      search_box = driver.find_element(By.ID, "searchbox")
      actions = ActionChains(driver)
      actions.move_to_element(search_box).send_keys(keyword).send_keys(Keys.RETURN)
      actions.perform()
      time.sleep(3)
  def main():
      """Crawl Google Maps place links for one keyword around stored coordinates.

      Steps:
          1. Read the last recorded ``num`` for the keyword from
             ``progress_list`` (``-1`` when there is no row).
          2. Load every ``lat_lon_loc`` row with ``num`` at or above that value.
          3. For each row, upsert the progress, open Google Maps centred on
             the row's coordinates, search for the keyword, and print the
             place links found over four passes of the result pane.

      The browser is always closed when the function exits, including on
      errors.

      NOTE(review): the loop nesting below was reconstructed from a copy of
      this file that had lost its indentation. Confirm it against the
      original.
      """
      # NOTE(review): the credentials are hard-coded in the connection URL;
      # consider loading them from the environment or a config file.
      db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
      print('drvier start...')
      driver = brower_start()
      try:
          keyword = '滷肉飯'

          # Resume point: the first matching progress row wins.
          num = -1
          for row in db.query('select num from progress_list where kw = :kw', kw=keyword):
              num = row['num']
              break

          progress_table = db['progress_list']
          # Bound parameters instead of string concatenation, so quoting in
          # the keyword cannot break or alter the statement.
          cursor = db.query('select * from lat_lon_loc where num >= :num', num=num)
          # Materialise the rows first so the result set is not held open
          # while the browser works.
          locations = [
              {'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']}
              for c in cursor
          ]

          # Column order of the rows built below. Only the disabled INSERT
          # uses it, but it documents the layout of ``result``.
          shop_item_list_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']

          for loc in locations:
              latitude = loc['lat']    # latitude
              longitude = loc['lon']   # longitude
              # Record progress before visiting, so a restart resumes here.
              progress_table.upsert({'kw': keyword, 'num': loc['num']}, ['kw'])

              url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
              driver.get(url)
              keyin_keyword(driver, keyword)

              for page in range(4):
                  print(loc['loc'], latitude, longitude, page)
                  url_list = get_url_list(driver)
                  print(url_list)

                  for item in url_list:
                      result = [
                          item[1],
                          longitude,
                          latitude,
                          keyword,
                          item[0],
                          datetime.today().strftime("%Y/%m/%d %H:%M"),
                      ]
                      print(result)
                      # Persistence is currently disabled; rows are only printed.
                      # insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
                      #     .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'',''), tuple(result))
                      # DA.mysql_insert_data(db, insert_sql)

                  # NOTE(review): the button is clicked only after passes 0
                  # and 1, so passes 2 and 3 read the pane without advancing
                  # in between. Confirm this is intended. The element id is
                  # presumably the "next page" button.
                  if page < 2:
                      driver.implicitly_wait(30)
                      # ``find_element_by_id`` was removed in Selenium 4.3.
                      element = driver.find_element(By.ID, 'ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                      ActionChains(driver).move_to_element(element).click(element).perform()
      finally:
          # Always release the browser process, even when the crawl fails.
          driver.quit()
  # Run the crawl only when this file is executed as a script, not on import.
  if __name__ == '__main__':
      main()