# shop_item_list.py (4.3 KB)
  # -*- coding: utf-8 -*-
  from selenium import webdriver
  from selenium.webdriver.common.action_chains import ActionChains
  from selenium.webdriver.common.keys import Keys
  from selenium.webdriver.support import expected_conditions as EC
  from selenium.webdriver.support.wait import WebDriverWait
  from selenium.webdriver.common.by import By
  from bs4 import BeautifulSoup
  from utility import database_access as DA
  # NOTE: the wildcard imports are kept ahead of the stdlib/pandas imports below,
  # as in the original, so nothing they export can shadow `time`, `datetime`, `pd`, ...
  from utility.parseutils import *
  from utility.connect import *
  from datetime import datetime
  import pandas as pd
  import time
  import json
  import re
  import sys
  import traceback
  def brower_start(port):
      """Open a Chrome session through the Selenium hub on localhost.

      The hub is addressed as ``http://127.0.0.1:<port>/wd/hub`` and the session
      is requested with the capabilities of a default ``ChromeOptions`` object.

      Args:
          port: Port of the local Selenium hub (anything ``str()`` accepts).

      Returns:
          The ``webdriver.Remote`` instance for the new session.
      """
      # A different hub was used at some point: http://192.53.174.202:4444/wd/hub
      hub_url = 'http://127.0.0.1:' + str(port) + '/wd/hub'
      chrome_options = webdriver.ChromeOptions()
      return webdriver.Remote(
          command_executor=hub_url,
          desired_capabilities=chrome_options.to_capabilities(),
      )
  def page_down_(driver, xpath_css, time_):
      """Click an element and then send PAGE_DOWN to the page `time_` times.

      Args:
          driver: Selenium WebDriver controlling the page.
          xpath_css: XPath selecting the candidate element(s) to click. When more
              than one element matches, the second match is used; otherwise the
              first.
          time_: Number of PAGE_DOWN key presses to send (0.5 s pause after each).

      Raises:
          IndexError: If no element matches `xpath_css`.
      """
      # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
      # removed in Selenium 4.3; this form works on older releases as well.
      elmts = driver.find_elements(By.XPATH, xpath_css)
      print(elmts)
      if not elmts:
          # Same exception type the original raised via elmts[0], but with a
          # message that says what was being looked for.
          raise IndexError('no element matches xpath: {}'.format(xpath_css))
      elmt = elmts[1] if len(elmts) > 1 else elmts[0]

      # Click the element first, presumably so that it receives the key presses.
      ActionChains(driver).move_to_element(elmt).click().perform()

      for _ in range(time_):
          try:
              # A fresh chain per press so earlier actions are not replayed.
              ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
          except Exception:
              # Best effort: report the failure and keep scrolling.
              traceback.print_exc()
          time.sleep(0.5)
  def get_url_list(driver):
      """Page down inside the ``#pane`` element, then collect place links.

      The current page source is parsed with BeautifulSoup after eight PAGE_DOWN
      presses, and every ``<a>`` whose ``href`` contains ``maps/place`` and which
      also has an ``aria-label`` attribute is collected.

      Args:
          driver: Selenium WebDriver showing the page to read.

      Returns:
          A list of ``[href, aria_label]`` pairs (possibly empty).
      """
      page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]', 8)
      url_soup = BeautifulSoup(driver.page_source, 'html.parser')

      url_list = []
      for anchor in url_soup.find_all('a'):
          href = anchor.get('href')
          label = anchor.get('aria-label')
          # Anchors lacking either attribute are skipped explicitly instead of
          # relying on a bare `except` to hide the resulting KeyError.
          if href is None or label is None:
              continue
          if 'maps/place' in href:
              url_list.append([href, label])

      print(len(url_list))
      return url_list
  def keyin_keyword(driver, keyword):
      """Type `keyword` at the element with id ``searchbox`` and press Return.

      Args:
          driver: Selenium WebDriver showing the page to search in.
          keyword: Text to type.

      Sleeps 3 seconds afterwards before returning.
      """
      # Configure the implicit wait *before* the lookup so the lookup itself
      # benefits from it (the original set it only after the element was found).
      driver.implicitly_wait(30)
      # find_element(By.ID, ...) replaces find_element_by_id, which was removed
      # in Selenium 4.3; this form works on older releases as well.
      button = driver.find_element(By.ID, "searchbox")
      ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
      time.sleep(3)
  def main():
      """Crawl Google Maps search results around every coordinate in the CSV.

      Command line: ``[keyword] [port]``. `keyword` defaults to '麻辣火鍋' and
      `port` (the local Selenium hub port) defaults to 4444.

      For each row of ``lat_long_location.csv`` the map is opened at the row's
      latitude/longitude, the keyword is searched, and the place links found on
      the result pages are inserted into the ``shop_item_list`` table. A row
      whose processing raises is appended to ``error_shop_item_list.csv`` and
      the crawl continues with the next row.
      """
      data = pd.read_csv('lat_long_location.csv', index_col=0)
      keyword = sys.argv[1] if len(sys.argv) > 1 else '麻辣火鍋'
      port = int(sys.argv[2]) if len(sys.argv) > 2 else 4444

      print('drvier start...')
      driver = brower_start(port)
      try:
          db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)

          # Loop-invariant: "(name, lon, lat, keyword, item_url, crawler_date)".
          shop_item_list_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
          columns_sql = str(tuple(shop_item_list_col)).replace('\'', '')

          for k, row in data.iterrows():
              try:
                  latitude = row['latitude']    # latitude
                  longitude = row['longitude']  # longitude
                  url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
                  driver.get(url)
                  keyin_keyword(driver, keyword)

                  for page in range(4):
                      print(keyword, k, row['location'], latitude, longitude, page)
                      url_list = get_url_list(driver)

                      for item in url_list:
                          result = [item[1], longitude, latitude, keyword, item[0],
                                    datetime.today().strftime("%Y/%m/%d %H:%M")]
                          # NOTE(review): values are interpolated into the SQL text
                          # via the Python repr of a tuple, and item[1]/item[0] come
                          # from the scraped page. Prefer a parameterized query if
                          # the DA helper supports one - TODO confirm its API.
                          insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
                              .format('shop_item_list', columns_sql, tuple(result))
                          DA.mysql_insert_data(db, insert_sql)

                      # NOTE(review): the next-page button is clicked only after
                      # pages 0 and 1, so iterations 2 and 3 read the same page;
                      # confirm whether `page < 3` was intended.
                      if page < 2:
                          # find_element(By.ID, ...) replaces find_element_by_id,
                          # which was removed in Selenium 4.3.
                          element = driver.find_element(By.ID, 'ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                          driver.implicitly_wait(30)
                          ActionChains(driver).move_to_element(element).click(element).perform()
              except Exception:
                  # Keep crawling, but show why this row failed and record it so
                  # it can be retried later. KeyboardInterrupt/SystemExit are no
                  # longer swallowed, so the run can be stopped cleanly.
                  traceback.print_exc()
                  error = pd.DataFrame([row])
                  error.to_csv('error_shop_item_list.csv', mode='a', header=False)
      finally:
          # Always release the remote browser session, even on abort.
          driver.quit()
  # Run the crawler only when this file is executed as a script.
  if __name__ == "__main__":
      main()