shop_item_list.py

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
import selenium
import traceback
from bs4 import BeautifulSoup

from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *

from datetime import datetime
import pandas as pd
import dataset
import time
import json
import re
import sys, os
import socket

# When True, drive a local, visible Chrome instead of the remote (dockerised)
# Selenium endpoint; flipped in main() for one known development host.
chrome_window = False

def brower_start(port):
    """Start a Chrome driver: local when chrome_window is set, otherwise a
    Remote driver pointed at a Selenium endpoint on the given local port."""
    options = webdriver.ChromeOptions()
    if chrome_window:
        browser = webdriver.Chrome(
            desired_capabilities=options.to_capabilities()
        )
    else:
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
            desired_capabilities=options.to_capabilities()
        )
    return browser
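
# Usage sketch for the remote path above (assumption, following main() below:
# each worker talks to a dockerised Selenium instance named p<port> whose
# internal port 4444 is published on <port>; the image name is illustrative):
#
#   docker run -d --name p4447 -p 4447:4444 selenium/standalone-chrome
#   driver = brower_start(4447)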

def page_down_(driver, xpath_css, time_):
    # Read the "x - y 項結果" ("x - y results") counter above the result list.
    # The class names here come from Google Maps' minified DOM and may change
    # at any time. xpath_css is unused in the current code path.
    e = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
    result_count = e.text.split('-')[1].replace(' 項結果', '')
    print(result_count)
    if int(result_count) > 5:
        for i in range(time_):
            # Click just to the right of the last separator element,
            # presumably to focus the result pane and trigger lazy loading.
            e = driver.find_elements_by_css_selector('div[class="TFQHme"]')
            action = webdriver.common.action_chains.ActionChains(driver)
            action.move_to_element_with_offset(e[-1], e[-1].size['width'] + 1, 0)
            action.click()
            action.perform()
            time.sleep(0.5)

    # Earlier approach, kept for reference: scroll the pane with PAGE_DOWN.
    # elmts = driver.find_elements_by_xpath(xpath_css)
    # print(elmts)
    # if len(elmts) > 1:
    #     elmt = elmts[1]
    # else:
    #     elmt = elmts[0]
    # actions = ActionChains(driver)
    # actions.move_to_element(elmt).click().perform()
    # for i in range(time_):
    #     try:
    #         actions = ActionChains(driver)
    #         actions.send_keys(Keys.PAGE_DOWN).perform()
    #     except:
    #         traceback.print_exc()
    #     time.sleep(0.5)
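
# Hedged alternative sketch (not part of the original crawl path): since the
# class names "Jl2AFb" and "TFQHme" break without notice, this variant waits
# for the counter explicitly and scrolls with PAGE_DOWN instead of offset
# clicks.
def page_down_safe(driver, times):
    wait = WebDriverWait(driver, 10)
    counter = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'span[class="Jl2AFb"]')))
    result_count = counter.text.split('-')[1].replace(' 項結果', '')
    if int(result_count) > 5:
        for _ in range(times):
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(0.5)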

def get_url_list(driver):
    # Earlier approach, kept for reference: arrow-key through the result
    # links, then wait for the "next page" button to become clickable.
    # for i in range(5, 43, 2):
    #     try:
    #         wait = WebDriverWait(driver, 60)
    #         wait.until(
    #             EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)))
    #         )
    #         driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
    #         time.sleep(0.5)
    #     except:
    #         pass
    # wait = WebDriverWait(driver, 30)
    # try:
    #     wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ppdPk-Ej1Yeb-LgbsSe-tJiF1e"]')))
    # except selenium.common.exceptions.TimeoutException:
    #     traceback.print_exc()
    #     return "EMPTY"

    # Scroll the result pane, then collect every place link on the page.
    page_down_(driver, '//div[@class="TFQHme"]', 8)
    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except:
            pass
    # print(len(url_list))
    return url_list
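
# Shape of the return value, for reference (values are illustrative):
#   [['https://www.google.com.tw/maps/place/...', '<aria-label, i.e. the
#     place name>'], ...]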

def keyin_keyword(driver, keyword):
    # Type the keyword into the Maps search box and submit it.
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
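
# Note: "searchbox" was the id of the Maps search container when this was
# written; if Google renames it, find_element_by_id raises
# NoSuchElementException. A guarded lookup (sketch) would be:
#   WebDriverWait(driver, 10).until(
#       EC.presence_of_element_located((By.ID, 'searchbox')))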

# Earlier version, kept for reference: pick a random under-crawled keyword
# and return its remaining grid points in one call.
# def get_crawler_list(db):
#     result = db.query('select keyword, count(*) from shop_item_list group by keyword')
#     result = pd.DataFrame([i for i in result])
#     result.columns = ['keyword', 'count']
#     result = result[result['count'] < 100]
#     keyword = result.sample(1).iloc[0]['keyword']
#     num = 0
#     cursor = db.query('select num from progress_list2 where kw = "' + keyword + '"')
#     for c in cursor:
#         num = c['num']
#         break
#     cursor = db.query('select * from lat_lon_loc where num >= ' + str(num))
#     # cursor = db.query('select * from lat_lon_loc')
#     lst = []
#     for c in cursor:
#         lst.append({'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']})
#     return keyword, lst

def get_crawler_list(db):
    # Pick the next keyword to crawl: drop noise rows (keywords containing
    # '項', likely scraped result-count text), then skip keywords that
    # already have an in-progress entry (num < 367) in progress_list2.
    result = db.query('select * from shop_item_list order by keyword')
    result = pd.DataFrame([i for i in result])
    result = result[~result.keyword.str.contains('項')]
    progress = db.query('select distinct(kw) from progress_list2 where num < 367')
    progress = pd.DataFrame([i for i in progress])
    if len(progress) != 0:
        keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0]['keyword']
    else:
        keyword = result.iloc[0]['keyword']
    return keyword
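
# Table roles, inferred from the queries in this file (no schema ships with
# this snapshot, so treat these as assumptions): shop_item_list holds the
# keywords collected so far, progress_list2 tracks per-keyword progress as
# (kw, num), and lat_lon_loc is the grid of (num, loc, lat, lon) sample
# points that main() searches around.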

def get_lon_lat_list(db, keyword):
    # Resume from the last grid point recorded for this keyword, then return
    # every remaining (num, loc, lat, lon) sample point.
    num = 0
    cursor = db.query('select num from progress_list where kw = "' + keyword + '"')
    for c in cursor:
        num = c['num']
        break
    cursor = db.query('select * from lat_lon_loc where num >= ' + str(num))
    lst = []
    for c in cursor:
        lst.append({'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']})
    return lst
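
# Hedged sketch of a parameterized rewrite: the string concatenation above
# breaks on keywords containing a double quote and is open to SQL injection.
# dataset sits on SQLAlchemy, so a bound-parameter version could look like:
#
#   from sqlalchemy import text
#   cursor = db.query(text('select num from progress_list where kw = :kw'),
#                     kw=keyword)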

def main():
    global chrome_window
    # Use a visible local Chrome on the one known development host.
    localip = socket.gethostbyname(socket.gethostname())
    if localip == '192.168.1.108':
        chrome_window = True

    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list3']
    table2 = db['progress_list2']

    port = 4447
    if len(sys.argv) > 1:
        port = int(sys.argv[1])

    # Restart the matching Selenium container before attaching to it.
    print('restart docker p{}'.format(port))
    os.system('sudo docker container restart p' + str(port))
    time.sleep(8)

    print('driver start...')
    driver = brower_start(port)

    for i in range(10):
        try:
            keyword = get_crawler_list(db)
            print(keyword)
            lst = get_lon_lat_list(db, keyword)
            print(keyword, len(lst))
            for r in lst:
                latitude = r['lat']    # latitude
                longitude = r['lon']   # longitude
                area_num = r['num']
                table2.upsert({'kw': keyword, 'num': r['num']}, ['kw'])

                # Open Maps centred on the grid point, then search the keyword.
                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
                driver.get(url)
                keyin_keyword(driver, keyword)
                failcnt = 0  # currently unused

                for page in range(10):
                    print(keyword, latitude, longitude, page)
                    url_list = get_url_list(driver)
                    duplicate = 0
                    # shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
                    for item in url_list:
                        try:
                            table.insert({'name': item[1], 'lon': longitude, 'lat': latitude,
                                          'keyword': keyword, 'item_url': item[0],
                                          'area_num': area_num,
                                          'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
                        except:
                            duplicate += 1
                    print(len(url_list), duplicate)
                    # result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
                    # insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
                    #     .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'', ''), tuple(result))
                    # DA.mysql_insert_data(db, insert_sql)

                    # Click the next-page button, but only for the first two
                    # pages; stop when the button is disabled.
                    if page < 2:
                        element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                        if element.get_attribute('disabled'):
                            break
                        driver.implicitly_wait(30)
                        ActionChains(driver).move_to_element(element).click(element).perform()
        except:
            pass


if __name__ == '__main__':
    main()
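
# Invocation, inferred from the argv handling in main():
#   python shop_item_list.py 4447   # 4447 = port of the p4447 Selenium container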