# swire_shop_review.py (8.4 KB)
  1. # -*- coding: utf-8 -*-
  2. #from selenium import webdriver
  3. from seleniumwire import webdriver
  4. from selenium.webdriver.common.action_chains import ActionChains
  5. from selenium.webdriver.common.keys import Keys
  6. from selenium.webdriver.support import expected_conditions as EC
  7. from selenium.webdriver.support.wait import WebDriverWait
  8. from selenium.webdriver.common.by import By
  9. import selenium
  10. import gzip
  11. import traceback
  12. from bs4 import BeautifulSoup
  13. from utility import database_access as DA
  14. from utility.parseutils import *
  15. from utility.connect import *
  16. from datetime import datetime
  17. from requests import session
  18. import pandas as pd
  19. import dataset
  20. import time
  21. import json
  22. import re
  23. import gzip
  24. import sys, os
  25. import socket
  26. import brotli
  27. import pickle
  28. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  29. import urllib.parse
  30. chrome_window=True
  31. #chrome_window=False
  32. globalkw=None
  33. proxyport=8787
  34. db_columns = ['author_id','author_page','author_name', 'author_image', 'author_review_count',
  35. 'review_time', 'review_content', 'review_image',
  36. 'store_review_time','store_review']
  37. def write_to_file(jsobj,fname):
  38. with open(fname, 'wb') as handle:
  39. pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
  40. def build_cache(db):
  41. global reviews_table
  42. id_dict={}
  43. cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
  44. for c in cursor:
  45. key = '{}_{}'.format(c['fid'],c['author_id'])
  46. id_dict[key]=1
  47. return id_dict
  48. def brower_start(port):
  49. global proxyport
  50. global chrome_window
  51. print(proxyport)
  52. options = webdriver.ChromeOptions()
  53. if chrome_window:
  54. # browser = webdriver.Chrome(
  55. ## desired_capabilities=options.to_capabilities()
  56. # )
  57. options.add_argument('--ignore-certificate-errors')
  58. options.add_argument("--no-sandbox")
  59. options.add_argument("--headless")
  60. options.add_argument("--disable-gpu")
  61. options.add_argument("--disable-dev-shm-usage")
  62. browser = webdriver.Chrome(
  63. options=options
  64. # ,seleniumwire_options={'disable_encoding': True}
  65. # desired_capabilities=options.to_capabilities()
  66. )
  67. browser.set_window_size(1400,1000)
  68. else:
  69. chrome_options = webdriver.ChromeOptions()
  70. chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport)) # Specify your Kubernetes service-name here
  71. chrome_options.add_argument('--ignore-certificate-errors')
  72. chrome_options.add_argument("--no-sandbox")
  73. chrome_options.add_argument("--disable-dev-shm-usage")
  74. browser = webdriver.Remote(
  75. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  76. desired_capabilities=chrome_options.to_capabilities(),
  77. seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
  78. )
  79. browser.set_window_size(1400,1000)
  80. return browser
  81. def get_next_job(db):
  82. result = {}
  83. sql = '''select t1.name, t1.ludocid, t1.fid, t1.user_ratings_total, t2.place_id from
  84. (select * from shop_list3 where ludocid is NOT NULL and user_ratings_total is NOT NULL and
  85. fid not in (select fid from review_process ) ORDER BY RAND() limit 5 )
  86. as t1 join google_poi.swire_store_list as t2 on t1.fid = t2.fid'''
  87. result = db.query(sql)
  88. url_pd = pd.DataFrame([dict(i) for i in result])
  89. url_pd['item_url'] = url_pd['place_id'].apply(lambda x: 'https://www.google.com/maps/place/?q=place_id:{}'.format(x) )
  90. return url_pd
  91. def parsing_js(resp):
  92. jsobj = json.loads(resp[5::])
  93. result = []
  94. for i in range(len(jsobj[2])):
  95. tmp = []
  96. tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
  97. tmp += [jsobj[2][i][1], jsobj[2][i][3]]
  98. # image
  99. image = []
  100. if jsobj[2][i][14]:
  101. for j in range(len(jsobj[2][i][14])):
  102. image += [jsobj[2][i][14][j][6][0]]
  103. tmp += [image]
  104. # store reply
  105. if jsobj[2][i][9]:
  106. tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
  107. else:
  108. tmp += ['', '']
  109. tmp_dict = {}
  110. for i in range(len(db_columns)):
  111. tmp_dict[db_columns[i]] = tmp[i]
  112. tmp_dict['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
  113. result.append(tmp_dict)
  114. # write_to_file(orig,'debug.pickle')
  115. return result
  116. def save_js_to_db(jsobj, fid):
  117. global reviews_table
  118. global iddict
  119. for r in jsobj:
  120. r['fid'] = fid
  121. key = '{}_{}'.format(r['fid'], r['author_id'])
  122. if iddict.get(key) is not None:
  123. continue
  124. try:
  125. r['review_image'] = str(r['review_image'])
  126. reviews_table.insert(r)
  127. except:
  128. traceback.print_exc()
  129. def process_web_request(driver, fid):
  130. time.sleep(3)
  131. print("ppppppppp&**********************")
  132. for request in driver.requests:
  133. if request.response:
  134. # print(request.url)
  135. if 'listentitiesreviews?' in request.url :
  136. print('parsing js:')
  137. print(request.url)
  138. # resp = brotli.decompress(request.response.body)
  139. resp=request.response.body
  140. if 'gzip' in request.response.headers.get('Content-Encoding'):
  141. resp = gzip.decompress(request.response.body)
  142. if 'br' in request.response.headers.get('Content-Encoding'):
  143. resp = brotli.decompress(request.response.body)
  144. jstext = resp.decode('utf-8')
  145. result = parsing_js(jstext)
  146. save_js_to_db(result, fid)
  147. time.sleep(1)
  148. del driver.requests
  149. return 1
  150. del driver.requests
  151. return 0
  152. def page_down_(driver, xpath_css, time_):
  153. elmts = driver.find_elements_by_xpath(xpath_css)
  154. print(elmts)
  155. if len(elmts)>1:
  156. elmt=elmts[1]
  157. else:
  158. elmt=elmts[0]
  159. actions = ActionChains(driver)
  160. actions.move_to_element(elmt).click().perform()
  161. for i in range(time_):
  162. try:
  163. actions = ActionChains(driver)
  164. actions.send_keys(Keys.PAGE_DOWN).perform()
  165. except:
  166. traceback.print_exc()
  167. time.sleep(0.5)
  168. def get_reviews(driver, reviews_cnt):
  169. wait = WebDriverWait(driver, 30)
  170. more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
  171. wait.until(
  172. EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
  173. )
  174. element = driver.find_element_by_css_selector(more_reviews_css)
  175. driver.implicitly_wait(10)
  176. ActionChains(driver).move_to_element(element).click(element).perform()
  177. time.sleep(0.5)
  178. reviews_cnt = int(reviews_cnt)
  179. if reviews_cnt > 10:
  180. page_down_count = int(reviews_cnt) // 3
  181. page_down_(driver, '//div[@class="PPCwl"]', page_down_count)
def main():
    # Entry point: connect to MySQL, build the dedup cache, start a browser,
    # then crawl reviews for a random batch of shops from get_next_job().
    global chrome_window
    global store_list_table
    global reviews_table
    global proxyport
    global iddict
    # localip=socket.gethostbyname(socket.gethostname())
    # NOTE(review): DB credentials are hard-coded here; move to env/config.
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    reviews_table = db['reviews_table']
    # Cache of already-crawled (fid, author_id) pairs so reruns skip them.
    iddict = build_cache(db)
    port = 4444
    # Optional CLI overrides: argv[1] = selenium hub port, argv[2] = proxy port.
    if len(sys.argv) == 3 :
        port = int(sys.argv[1])
        proxyport = int(sys.argv[2])
    if not chrome_window:
        # Docker mode: restart the paired selenium container before driving it.
        print('restart docker pw{}'.format(port))
        # os.system('sudo docker container restart p'+str(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(10)
    print('drvier start...')
    driver = brower_start(port)
    job = get_next_job(db)
    # One row per shop: open its Maps page, expand reviews, harvest the XHRs.
    for row, group in job.iterrows():
        try:
            item_url = group['item_url']
            reviews_cnt = group['user_ratings_total']
            fid = group['fid']
            print(reviews_cnt, item_url)
            # Up to 3 attempts per shop; stop once a payload was captured.
            for i in range(3):
                print('reviews try...{}'.format(i))
                print("reviews try.....{}".format(datetime.now()))
                driver.get(item_url)
                time.sleep(0.5)
                get_reviews(driver, reviews_cnt)
                status = process_web_request(driver, fid)
                if status:
                    # Mark this fid as processed so get_next_job skips it.
                    db['review_process'].insert({'fid':fid, 'dt':datetime.now()})
                    break
        except:
            # Best-effort batch: log the failure and move to the next shop.
            traceback.print_exc()


if __name__ == '__main__':
    main()