# swire_shop_review.py — Google Maps review crawler (selenium-wire + MySQL)
  1. # -*- coding: utf-8 -*-
  2. #from selenium import webdriver
  3. from seleniumwire import webdriver
  4. from selenium.webdriver.common.action_chains import ActionChains
  5. from selenium.webdriver.common.keys import Keys
  6. from selenium.webdriver.support import expected_conditions as EC
  7. from selenium.webdriver.support.wait import WebDriverWait
  8. from selenium.webdriver.common.by import By
  9. import selenium
  10. import gzip
  11. import traceback
  12. from bs4 import BeautifulSoup
  13. from utility import database_access as DA
  14. from utility.parseutils import *
  15. from utility.connect import *
  16. from datetime import datetime
  17. from requests import session
  18. import pandas as pd
  19. import dataset
  20. import time
  21. import json
  22. import re
  23. import sys, os
  24. import socket
  25. import brotli
  26. import pickle
  27. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  28. import urllib.parse
  29. chrome_window=False
  30. globalkw=None
  31. proxyport=8787
  32. db_columns = ['author_id','author_page','author_name', 'author_image', 'author_review_count',
  33. 'review_time', 'review_content', 'review_image',
  34. 'store_review_time','store_review']
  35. def write_to_file(jsobj,fname):
  36. with open(fname, 'wb') as handle:
  37. pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
  38. def build_cache(db):
  39. global reviews_table
  40. id_dict={}
  41. cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
  42. for c in cursor:
  43. key = '{}_{}'.format(c['fid'],c['author_id'])
  44. id_dict[key]=1
  45. return id_dict
  46. def brower_start(port):
  47. global proxyport
  48. global chrome_window
  49. print(proxyport)
  50. options = webdriver.ChromeOptions()
  51. if chrome_window:
  52. browser = webdriver.Chrome(
  53. desired_capabilities=options.to_capabilities()
  54. )
  55. else:
  56. chrome_options = webdriver.ChromeOptions()
  57. chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport)) # Specify your Kubernetes service-name here
  58. chrome_options.add_argument('--ignore-certificate-errors')
  59. chrome_options.add_argument("--no-sandbox")
  60. chrome_options.add_argument("--disable-dev-shm-usage")
  61. browser = webdriver.Remote(
  62. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  63. desired_capabilities=chrome_options.to_capabilities(),
  64. seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
  65. )
  66. browser.set_window_size(1400,1000)
  67. return browser
  68. def get_next_job(db):
  69. result = {}
  70. sql = '''select t1.name, t1.ludocid, t1.fid, t1.user_ratings_total, t2.place_id from
  71. (select * from shop_list3 where ludocid is NOT NULL and user_ratings_total is NOT NULL and
  72. fid not in (select fid from review_process ) ORDER BY RAND() limit 1 )
  73. as t1 join google_poi.swire_store_list as t2 on t1.fid = t2.fid'''
  74. result = db.query(sql)
  75. url_pd = pd.DataFrame([dict(i) for i in result])
  76. url_pd['item_url'] = url_pd['place_id'].apply(lambda x: 'https://www.google.com/maps/place/?q=place_id:{}'.format(x) )
  77. return url_pd
  78. def parsing_js(resp):
  79. jsobj = json.loads(resp[5::])
  80. result = []
  81. for i in range(len(jsobj[2])):
  82. tmp = []
  83. tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
  84. tmp += [jsobj[2][i][1], jsobj[2][i][3]]
  85. # image
  86. image = []
  87. if jsobj[2][i][14]:
  88. for j in range(len(jsobj[2][i][14])):
  89. image += [jsobj[2][i][14][j][6][0]]
  90. tmp += [image]
  91. # store reply
  92. if jsobj[2][i][9]:
  93. tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
  94. else:
  95. tmp += ['', '']
  96. tmp_dict = {}
  97. for i in range(len(db_columns)):
  98. tmp_dict[db_columns[i]] = tmp[i]
  99. tmp_dict['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
  100. result.append(tmp_dict)
  101. # write_to_file(orig,'debug.pickle')
  102. return result
  103. def save_js_to_db(jsobj, fid):
  104. global reviews_table
  105. global iddict
  106. for r in jsobj:
  107. r['fid'] = fid
  108. key = '{}_{}'.format(r['fid'], r['author_id'])
  109. if iddict.get(key) is not None:
  110. continue
  111. try:
  112. r['review_image'] = str(r['review_image'])
  113. reviews_table.insert(r)
  114. except:
  115. traceback.print_exc()
  116. def process_web_request(driver, fid, ludocid):
  117. time.sleep(3)
  118. print("ppppppppp&**********************")
  119. for request in driver.requests:
  120. if request.response:
  121. # print(request.url)
  122. if 'listentitiesreviews?' in request.url :
  123. if request.url.find(ludocid) != -1:
  124. print('parsing js:')
  125. print(request.url)
  126. resp = brotli.decompress(request.response.body)
  127. if 'gzip' in request.response.headers.get('Content-Encoding'):
  128. resp = gzip.decompress(request.response.body)
  129. if 'br' in request.response.headers.get('Content-Encoding'):
  130. resp = brotli.decompress(request.response.body)
  131. jstext = resp.decode('utf-8')
  132. result = parsing_js(jstext)
  133. save_js_to_db(result, fid)
  134. time.sleep(1)
  135. return 1
  136. return 0
  137. def page_down_(driver, xpath_css, time_):
  138. elmts = driver.find_elements_by_xpath(xpath_css)
  139. print(elmts)
  140. if len(elmts)>1:
  141. elmt=elmts[1]
  142. else:
  143. elmt=elmts[0]
  144. actions = ActionChains(driver)
  145. actions.move_to_element(elmt).click().perform()
  146. for i in range(time_):
  147. try:
  148. actions = ActionChains(driver)
  149. actions.send_keys(Keys.PAGE_DOWN).perform()
  150. except:
  151. traceback.print_exc()
  152. time.sleep(0.5)
  153. def get_reviews(driver, reviews_cnt):
  154. wait = WebDriverWait(driver, 30)
  155. more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
  156. wait.until(
  157. EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
  158. )
  159. element = driver.find_element_by_css_selector(more_reviews_css)
  160. driver.implicitly_wait(10)
  161. ActionChains(driver).move_to_element(element).click(element).perform()
  162. time.sleep(0.5)
  163. reviews_cnt = int(reviews_cnt)
  164. if reviews_cnt > 10:
  165. page_down_count = int(reviews_cnt) // 3
  166. page_down_(driver, '//div[@class="PPCwl"]', page_down_count)
  167. def main():
  168. global chrome_window
  169. global store_list_table
  170. global reviews_table
  171. global proxyport
  172. global iddict
  173. # localip=socket.gethostbyname(socket.gethostname())
  174. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
  175. store_list_table = db['swire_store_list']
  176. reviews_table = db['reviews_table']
  177. iddict=build_cache(db)
  178. port=4444
  179. if len(sys.argv) == 3 :
  180. port=int(sys.argv[1])
  181. proxyport=int(sys.argv[2])
  182. if not chrome_window:
  183. print('restart docker pw{}'.format(port))
  184. # os.system('sudo docker container restart p'+str(port))
  185. os.system('sudo docker container restart pw'+str(port))
  186. time.sleep(10)
  187. print('drvier start...')
  188. driver = brower_start(port)
  189. job = get_next_job(db)
  190. for row, group in job.iterrows():
  191. try:
  192. item_url = group['item_url']
  193. reviews_cnt = group['reviews_cnt']
  194. fid = group['fid']
  195. ludocid = group['ludocid']
  196. print(reviews_cnt, item_url)
  197. for i in range(3):
  198. print('reviews try...{}'.format(i))
  199. print("reviews try.....{}".format(datetime.now()))
  200. driver.get(item_url)
  201. time.sleep(0.5)
  202. get_reviews(driver, reviews_cnt)
  203. status = process_web_request(driver, fid, ludocid)
  204. print(driver.current_url)
  205. if status:
  206. db['review_process'].insert({'fid':fid, 'dt':datetime.now()})
  207. break
  208. except:
  209. traceback.print_exc()
  210. if __name__ == '__main__':
  211. main()