# swire_shop_review.py
  1. # -*- coding: utf-8 -*-
  2. #from selenium import webdriver
  3. from seleniumwire import webdriver
  4. from selenium.webdriver.common.action_chains import ActionChains
  5. from selenium.webdriver.common.keys import Keys
  6. from selenium.webdriver.support import expected_conditions as EC
  7. from selenium.webdriver.support.wait import WebDriverWait
  8. from selenium.webdriver.common.by import By
  9. import selenium
  10. import traceback
  11. from bs4 import BeautifulSoup
  12. from utility import database_access as DA
  13. from utility.parseutils import *
  14. from utility.connect import *
  15. from datetime import datetime
  16. from requests import session
  17. import pandas as pd
  18. import dataset
  19. import time
  20. import json
  21. import re
  22. import sys, os
  23. import socket
  24. import brotli
  25. import pickle
  26. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  27. import urllib.parse
  28. chrome_window=False
  29. globalkw=None
  30. proxyport=8787
  31. db_columns = ['author_id','author_page','author_name', 'author_image', 'author_review_count',
  32. 'review_time', 'review_content', 'review_image',
  33. 'store_review_time','store_review']
  34. def write_to_file(jsobj,fname):
  35. with open(fname, 'wb') as handle:
  36. pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
  37. # import codecs
  38. # fw=codecs.open(fname,'w','utf-8')
  39. # fw.write(str(jsobj))
  40. # fw.close()
  41. def build_cache(db):
  42. global reviews_table
  43. id_dict={}
  44. cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
  45. for c in cursor:
  46. key = '{}_{}'.format(c['fid'],c['author_id'])
  47. id_dict[key]=1
  48. return id_dict
  49. def brower_start(port):
  50. global proxyport
  51. global chrome_window
  52. print(proxyport)
  53. options = webdriver.ChromeOptions()
  54. if chrome_window:
  55. browser = webdriver.Chrome(
  56. desired_capabilities=options.to_capabilities()
  57. )
  58. else:
  59. chrome_options = webdriver.ChromeOptions()
  60. chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport)) # Specify your Kubernetes service-name here
  61. chrome_options.add_argument('--ignore-certificate-errors')
  62. chrome_options.add_argument("--no-sandbox")
  63. chrome_options.add_argument("--disable-dev-shm-usage")
  64. browser = webdriver.Remote(
  65. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  66. desired_capabilities=chrome_options.to_capabilities(),
  67. seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
  68. )
  69. browser.set_window_size(1400,1000)
  70. return browser
  71. def get_next_job(db):
  72. result = {}
  73. result = db.query('select * from swire_store_list ORDER BY RAND() limit 1')
  74. url_pd = pd.DataFrame([dict(i) for i in result])
  75. url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
  76. remove = db.query('select fid from review_process')
  77. remove = pd.DataFrame([dict(i) for i in remove])
  78. remove_fid_list = remove['fid'].to_list()
  79. url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]
  80. return url_pd
  81. def parsing_js(resp):
  82. jsobj = json.loads(resp[5::])
  83. result = []
  84. for i in range(len(jsobj[2])):
  85. tmp = []
  86. tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
  87. tmp += [jsobj[2][i][1], jsobj[2][i][3]]
  88. # image
  89. image = []
  90. if jsobj[2][i][14]:
  91. for j in range(len(jsobj[2][i][14])):
  92. image += [jsobj[2][i][14][j][6][0]]
  93. tmp += [image]
  94. # store reply
  95. if jsobj[2][i][9]:
  96. tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
  97. else:
  98. tmp += ['', '']
  99. tmp_dict = {}
  100. for i in range(len(db_columns)):
  101. tmp_dict[db_columns[i]] = tmp[i]
  102. tmp_dict['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
  103. result.append(tmp_dict)
  104. # write_to_file(orig,'debug.pickle')
  105. return result
  106. def save_js_to_db(jsobj, fid):
  107. global reviews_table
  108. global iddict
  109. for r in jsobj:
  110. r['fid'] = fid
  111. key = '{}_{}'.format(r['fid'], r['author_id'])
  112. if iddict.get(key) is not None:
  113. continue
  114. try:
  115. r['review_image'] = str(r['review_image'])
  116. reviews_table.insert(r)
  117. except:
  118. traceback.print_exc()
  119. def process_web_request(db, driver, fid):
  120. time.sleep(0.8)
  121. time.sleep(3)
  122. print("ppppppppp&**********************")
  123. for request in driver.requests:
  124. if request.response:
  125. # print(request.url)
  126. if 'listentitiesreviews?' in request.url :
  127. print('parsing js:')
  128. print(request.url)
  129. resp = brotli.decompress(request.response.body)
  130. jstext = resp.decode('utf-8')
  131. result = parsing_js(jstext)
  132. save_js_to_db(result, fid)
  133. time.sleep(1)
  134. def page_down_(driver, xpath_css, time_):
  135. elmts = driver.find_elements_by_xpath(xpath_css)
  136. print(elmts)
  137. if len(elmts)>1:
  138. elmt=elmts[1]
  139. else:
  140. elmt=elmts[0]
  141. actions = ActionChains(driver)
  142. actions.move_to_element(elmt).click().perform()
  143. for i in range(time_):
  144. try:
  145. actions = ActionChains(driver)
  146. actions.send_keys(Keys.PAGE_DOWN).perform()
  147. except:
  148. traceback.print_exc()
  149. time.sleep(0.5)
  150. def get_reviews(driver, reviews_cnt):
  151. wait = WebDriverWait(driver, 30)
  152. more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
  153. wait.until(
  154. EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
  155. )
  156. element = driver.find_element_by_css_selector(more_reviews_css)
  157. driver.implicitly_wait(10)
  158. ActionChains(driver).move_to_element(element).click(element).perform()
  159. time.sleep(0.5)
  160. reviews_cnt = int(reviews_cnt)
  161. if reviews_cnt > 10:
  162. page_down_count = int(reviews_cnt) // 3
  163. page_down_(driver, '//div[@class="PPCwl"]', page_down_count)
  164. def main():
  165. global chrome_window
  166. global store_list_table
  167. global reviews_table
  168. global proxyport
  169. global iddict
  170. localip=socket.gethostbyname(socket.gethostname())
  171. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
  172. store_list_table = db['swire_store_list']
  173. reviews_table = db['reviews_table']
  174. iddict=build_cache(db)
  175. port=4444
  176. if len(sys.argv) == 3 :
  177. port=int(sys.argv[1])
  178. proxyport=int(sys.argv[2])
  179. if not chrome_window:
  180. print('restart docker pw{}'.format(port))
  181. # os.system('sudo docker container restart p'+str(port))
  182. os.system('sudo docker container restart pw'+str(port))
  183. time.sleep(10)
  184. print('drvier start...')
  185. driver = brower_start(port)
  186. job = get_next_job(db)
  187. for row, group in job.iterrows():
  188. try:
  189. item_url = group['item_url']
  190. reviews_cnt = group['reviews_cnt']
  191. fid = group['fid']
  192. print(reviews_cnt, item_url)
  193. driver.get(item_url)
  194. time.sleep(0.5)
  195. shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
  196. tmp_value = shop_soup.find('span', {'jsaction':'pane.rating.moreReviews'})
  197. if tmp_value:
  198. get_reviews(driver, reviews_cnt)
  199. process_web_request(db, driver, fid)
  200. print(driver.current_url)
  201. db['review_process'].insert({'fid':fid, 'dt':datetime.now()})
  202. except:
  203. traceback.print_exc()
  204. if __name__ == '__main__':
  205. main()