shop_item_list.py

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
import pandas as pd
import dataset
import traceback
import time
import json
import re
import sys


def brower_start(port):
    """Attach to a Selenium Chrome node listening on the given local port."""
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        # command_executor='http://192.53.174.202:4444/wd/hub',
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser
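

# A minimal local fallback sketch, assuming Chrome and chromedriver are installed
# on this machine instead of a remote Selenium grid. brower_start_local() is a
# hypothetical helper and not part of the original workflow; note that newer
# Selenium 4 releases take an options= argument in place of the deprecated
# desired_capabilities= used above.
def brower_start_local():
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')  # optional: run without a visible window
    return webdriver.Chrome(options=options)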


def page_down_(driver, xpath_css, time_):
    """Click into the results pane and press PAGE_DOWN `time_` times so that
    Google Maps lazy-loads more search results."""
    elmts = driver.find_elements_by_xpath(xpath_css)
    print(elmts)
    if len(elmts) > 1:
        elmt = elmts[1]
    else:
        elmt = elmts[0]

    actions = ActionChains(driver)
    actions.move_to_element(elmt).click().perform()
    for i in range(time_):
        try:
            actions = ActionChains(driver)
            actions.send_keys(Keys.PAGE_DOWN).perform()
        except:
            traceback.print_exc()
        time.sleep(0.5)


def get_url_list(driver):
    """Scroll the result list, then parse the page source and collect every
    place link as a [href, aria-label] pair."""
    # for i in range(5, 43, 2):
    #     try:
    #         wait = WebDriverWait(driver, 60)
    #         wait.until(
    #             EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)))
    #         )
    #         driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
    #         time.sleep(0.5)
    #     except:
    #         pass
    page_down_(driver, '//div[@class="TFQHme"]', 8)

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except:
            pass
    print(len(url_list))
    return url_list
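

# A hedged sketch of an explicit wait, using the WebDriverWait/EC imports above,
# as an alternative to the fixed time.sleep() calls. wait_for_results() is a
# hypothetical helper; the result-link XPath is an assumption and may need to be
# adjusted to the current Google Maps markup.
def wait_for_results(driver, timeout=60):
    wait = WebDriverWait(driver, timeout)
    return wait.until(
        EC.presence_of_element_located((By.XPATH, '//a[contains(@href, "maps/place")]'))
    )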


def keyin_keyword(driver, keyword):
    """Type the search keyword into the Google Maps search box and press Enter."""
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)


def get_crawler_list(db):
    """Pick a keyword that still has fewer than 500 crawled rows, then return it
    together with the lat_lon_loc rows that remain to be crawled for it."""
    result = db.query('select keyword, count(*) from shop_item_list group by keyword')
    result = pd.DataFrame([i for i in result])
    result.columns = ['keyword', 'count']
    result = result[result['count'] < 500]
    keyword = result.sample(1).iloc[0]['keyword']

    # resume from the last location number recorded for this keyword
    num = 0
    cursor = db.query('select num from progress_list2 where kw = "' + keyword + '"')
    for c in cursor:
        num = c['num']
        break

    cursor = db.query('select * from lat_lon_loc where num >= ' + str(num))
    # cursor = db.query('select * from lat_lon_loc')
    lst = []
    for c in cursor:
        lst.append({'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']})
    return keyword, lst
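

# The queries above assume roughly the following MySQL tables (a sketch inferred
# from this script, not an authoritative schema):
#   shop_item_list(name, lon, lat, keyword, item_url, crawler_date)
#       with some unique key, so duplicate inserts fail ('dup entry' in main())
#   progress_list2(kw, num)          -- last lat_lon_loc row started per keyword
#   lat_lon_loc(num, loc, lat, lon)  -- numbered grid of locations to search from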


def main():
    # data = pd.read_csv('lat_long_location.csv', index_col=0)
    # keyword = '麻辣火鍋'  # "spicy hot pot"
    port = 4444
    if len(sys.argv) > 1:
        port = sys.argv[1]
    # if len(sys.argv) > 2:
    #     port = int(sys.argv[2])
    print('driver start...')
    driver = brower_start(port)

    # db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list']
    table2 = db['progress_list2']

    keyword, data = get_crawler_list(db)
    print(keyword, len(data))
    for row in data:
        # try:
        latitude = row['lat']    # latitude
        longitude = row['lon']   # longitude
        table2.upsert({'kw': keyword, 'num': row['num']}, ['kw'])

        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
        driver.get(url)
        keyin_keyword(driver, keyword)

        for page in range(4):
            print(keyword, row['loc'], latitude, longitude, page)
            url_list = get_url_list(driver)
            shop_item_list_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
            for item in url_list:
                try:
                    table.insert({'name': item[1], 'lon': longitude, 'lat': latitude,
                                  'keyword': keyword, 'item_url': item[0],
                                  'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
                except:
                    print('dup entry')
                # result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
                # insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
                #     .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'', ''), tuple(result))
                # DA.mysql_insert_data(db, insert_sql)

            # click through to the next result page for the first few pages
            if page < 2:
                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                driver.implicitly_wait(30)
                ActionChains(driver).move_to_element(element).click(element).perform()
        # except:
        #     error = pd.DataFrame([row])
        #     error.to_csv('error_shop_item_list.csv', mode='a', header=False)
        #     driver.close()
        #     driver = brower_start()


if __name__ == '__main__':
    main()
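
# Usage sketch (assuming a Selenium node is already listening on the chosen port,
# 4444 by default):
#   python shop_item_list.py 4444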