shop_item_list.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  1. # -*- coding: utf-8 -*-
  2. from selenium import webdriver
  3. from selenium.webdriver.common.action_chains import ActionChains
  4. from selenium.webdriver.common.keys import Keys
  5. from selenium.webdriver.support import expected_conditions as EC
  6. from selenium.webdriver.support.wait import WebDriverWait
  7. from selenium.webdriver.common.by import By
  8. from bs4 import BeautifulSoup
  9. from utility import database_access as DA
  10. from utility.parseutils import *
  11. from utility.connect import *
  12. from datetime import datetime
  13. import pandas as pd
  14. import time
  15. import json
  16. import re
  17. def brower_start(port):
  18. options = webdriver.ChromeOptions()
  19. browser = webdriver.Remote(
  20. #command_executor='http://192.53.174.202:4444/wd/hub',
  21. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  22. desired_capabilities=options.to_capabilities()
  23. )
  24. return browser
  25. def page_down_(driver, xpath_css, time_):
  26. elmts = driver.find_elements_by_xpath(xpath_css)
  27. print(elmts)
  28. if len(elmts)>1:
  29. elmt=elmts[1]
  30. else:
  31. elmt=elmts[0]
  32. actions = ActionChains(driver)
  33. actions.move_to_element(elmt).click().perform()
  34. for i in range(time_):
  35. try:
  36. actions = ActionChains(driver)
  37. actions.send_keys(Keys.PAGE_DOWN).perform()
  38. except:
  39. traceback.print_exc()
  40. time.sleep(0.5)
  41. def get_url_list(driver):
  42. # for i in range(5, 43, 2):
  43. # try:
  44. # wait = WebDriverWait(driver, 60)
  45. # wait.until(
  46. # EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)))
  47. # )
  48. # driver.find_element(By.XPATH,'//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
  49. # time.sleep(0.5)
  50. # except:
  51. # pass
  52. page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]', 8)
  53. url_soup = BeautifulSoup(driver.page_source, 'html.parser')
  54. url_list = []
  55. for i in url_soup.find_all('a'):
  56. try:
  57. if i['href'].find('maps/place') != -1:
  58. url_list += [[i['href'], i['aria-label']]]
  59. except:
  60. pass
  61. print(len(url_list))
  62. return url_list
  63. def keyin_keyword(driver, keyword):
  64. button = driver.find_element_by_id("searchbox")
  65. driver.implicitly_wait(30)
  66. ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
  67. time.sleep(3)
  68. def main():
  69. data = pd.read_csv('lat_long_location.csv', index_col = 0)
  70. keyword = '麻辣火鍋'
  71. if len(sys.argv) >1:
  72. keyword=sys.argv[1]
  73. port=4444
  74. if len(sys.argv) >2:
  75. port=int(sys.argv[2])
  76. print('drvier start...')
  77. driver = brower_start(port)
  78. db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
  79. for k, row in data.iterrows():
  80. try:
  81. latitude = row['latitude'] #緯度
  82. longitude = row['longitude'] #精度
  83. url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
  84. driver.get(url)
  85. keyin_keyword(driver, keyword)
  86. for page in range(4):
  87. print(keyword, k, row['location'], latitude, longitude, page)
  88. url_list = get_url_list(driver)
  89. shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
  90. for item in url_list:
  91. result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
  92. insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
  93. .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'',''), tuple(result))
  94. DA.mysql_insert_data(db, insert_sql)
  95. if page < 2 :
  96. element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
  97. driver.implicitly_wait(30)
  98. ActionChains(driver).move_to_element(element).click(element).perform()
  99. except:
  100. error = pd.DataFrame([row])
  101. error.to_csv('error_shop_item_list.csv', mode='a', header = False)
  102. #driver.close()
  103. #driver = brower_start()
  104. if __name__ == '__main__':
  105. main()