shop_item_list.py

# -*- coding: utf-8 -*-
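"""Crawl Google Maps around each coordinate in lat_long_location.csv for
hot-pot ('火鍋') shops and insert each result into the shop_item_list
MySQL table via a remote Selenium node."""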
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
import pandas as pd
import time
import json
import re


def browser_start():
    # Connect to a remote Selenium node; Selenium 4 takes the options object
    # directly (Selenium 3 used desired_capabilities=options.to_capabilities()).
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://192.53.174.202:4444/wd/hub',
        options=options
    )
    return browser
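# A minimal local fallback for debugging, assuming chromedriver is available
# on the machine (a sketch, not part of the original script): inside
# browser_start() one could use `browser = webdriver.Chrome(options=options)`
# instead of webdriver.Remote.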


def get_url_list(driver):
    # Google Maps lazy-loads results, so press DOWN on each result link
    # (rows 5, 7, ..., 41 of the results pane) to scroll the list and force
    # every entry to render before parsing the page source.
    for i in range(5, 43, 2):
        try:
            wait = WebDriverWait(driver, 60)
            wait.until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)))
            )
            driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
            time.sleep(1)
        except Exception:
            pass

    # Keep every link to a place page, paired with its aria-label (the shop name).
    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list.append([i['href'], i['aria-label']])
        except KeyError:
            pass
    return url_list
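# Each entry of url_list has the shape [item_url, shop_name], e.g.
# ['https://www.google.com.tw/maps/place/...', '<shop name>'] (shape is
# illustrative only; the exact href format comes from Google Maps).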


def keyin_keyword(driver, keyword):
    # Focus the search box, type the keyword, and submit with Enter.
    button = driver.find_element(By.ID, 'searchbox')
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
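# Design note: the fixed time.sleep(3) above gives the results pane time to
# render; an explicit WebDriverWait on the results container would be more
# robust (a suggestion, not part of the original script).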


def main():
    data = pd.read_csv('lat_long_location.csv', index_col=0)
    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    print('driver start...')
    driver = browser_start()

    for k, row in data.iterrows():
        # if k < 297: continue  # uncomment to resume from a given row
        latitude = row['latitude']    # latitude
        longitude = row['longitude']  # longitude
        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
        driver.get(url)

        keyword = '火鍋'  # "hot pot"
        keyin_keyword(driver, keyword)

        for page in range(4):
            print(k, row['location'], latitude, longitude, page)
            url_list = get_url_list(driver)

            # One INSERT IGNORE per shop so duplicate rows are skipped, e.g.
            # INSERT IGNORE INTO shop_item_list (name, lon, lat, keyword,
            # item_url, crawler_date) VALUES (...).
            shop_item_list_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
            for item in url_list:
                result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime('%Y/%m/%d %H:%M')]
                insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
                    .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'', ''), tuple(result))
                DA.mysql_insert_data(db, insert_sql)

            # Advance to the next results page (only clicked while page < 2).
            if page < 2:
                element = driver.find_element(By.ID, 'ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                driver.implicitly_wait(30)
                ActionChains(driver).move_to_element(element).click(element).perform()


if __name__ == '__main__':
    main()
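
# Usage sketch (assumptions noted): run `python shop_item_list.py` with
# lat_long_location.csv (columns: location, latitude, longitude) in the working
# directory, MYSQL_CONFIG / DB_NAME supplied by utility.connect, and a Selenium
# Grid node reachable at http://192.53.174.202:4444/wd/hub.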