# swire_shop_review.py
  1. # -*- coding: utf-8 -*-
  2. #from selenium import webdriver
  3. from seleniumwire import webdriver
  4. from selenium.webdriver.common.action_chains import ActionChains
  5. from selenium.webdriver.common.keys import Keys
  6. from selenium.webdriver.support import expected_conditions as EC
  7. from selenium.webdriver.support.wait import WebDriverWait
  8. from selenium.webdriver.common.by import By
  9. import selenium
  10. import traceback
  11. from bs4 import BeautifulSoup
  12. from utility import database_access as DA
  13. from utility.parseutils import *
  14. from utility.connect import *
  15. from datetime import datetime
  16. from requests import session
  17. import pandas as pd
  18. import dataset
  19. import time
  20. import json
  21. import re
  22. import gzip
  23. import sys, os
  24. import socket
  25. import brotli
  26. import pickle
  27. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  28. import urllib.parse
# True: drive a local (headless) Chrome; False: go through a remote
# Selenium hub plus a selenium-wire proxy (see brower_start / main).
chrome_window=True
#chrome_window=False
# NOTE(review): appears unused anywhere in this file — possibly leftover.
globalkw=None
# Port the selenium-wire proxy listens on; overridden from sys.argv in main().
proxyport=8787
# Column names for one review row; the ORDER must match the field order
# assembled in parsing_js().
db_columns = ['author_id','author_page','author_name', 'author_image', 'author_review_count',
'review_time', 'review_content', 'review_image',
'store_review_time','store_review']
  36. def write_to_file(jsobj,fname):
  37. with open(fname, 'wb') as handle:
  38. pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
  39. # import codecs
  40. # fw=codecs.open(fname,'w','utf-8')
  41. # fw.write(str(jsobj))
  42. # fw.close()
  43. def build_cache(db):
  44. global reviews_table
  45. id_dict={}
  46. cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
  47. for c in cursor:
  48. key = '{}_{}'.format(c['fid'],c['author_id'])
  49. id_dict[key]=1
  50. return id_dict
  51. def brower_start(port):
  52. global proxyport
  53. global chrome_window
  54. print(proxyport)
  55. options = webdriver.ChromeOptions()
  56. if chrome_window:
  57. # browser = webdriver.Chrome(
  58. ## desired_capabilities=options.to_capabilities()
  59. # )
  60. options.add_argument('--ignore-certificate-errors')
  61. options.add_argument("--no-sandbox")
  62. options.add_argument("--headless")
  63. options.add_argument("--disable-gpu")
  64. options.add_argument("--disable-dev-shm-usage")
  65. browser = webdriver.Chrome(
  66. options=options
  67. # ,seleniumwire_options={'disable_encoding': True}
  68. # desired_capabilities=options.to_capabilities()
  69. )
  70. browser.set_window_size(1400,1000)
  71. else:
  72. chrome_options = webdriver.ChromeOptions()
  73. chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport)) # Specify your Kubernetes service-name here
  74. chrome_options.add_argument('--ignore-certificate-errors')
  75. chrome_options.add_argument("--no-sandbox")
  76. chrome_options.add_argument("--disable-dev-shm-usage")
  77. browser = webdriver.Remote(
  78. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  79. desired_capabilities=chrome_options.to_capabilities(),
  80. seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
  81. )
  82. browser.set_window_size(1400,1000)
  83. return browser
  84. def get_next_job(db):
  85. result = {}
  86. result = db.query('select * from swire_store_list ORDER BY RAND() limit 1')
  87. url_pd = pd.DataFrame([dict(i) for i in result])
  88. url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
  89. remove = db.query('select fid from review_process')
  90. remove = pd.DataFrame([dict(i) for i in remove])
  91. remove_fid_list = remove['fid'].to_list()
  92. url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]
  93. return url_pd
  94. def parsing_js(resp):
  95. jsobj = json.loads(resp[5::])
  96. result = []
  97. for i in range(len(jsobj[2])):
  98. tmp = []
  99. tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
  100. tmp += [jsobj[2][i][1], jsobj[2][i][3]]
  101. # image
  102. image = []
  103. if jsobj[2][i][14]:
  104. for j in range(len(jsobj[2][i][14])):
  105. image += [jsobj[2][i][14][j][6][0]]
  106. tmp += [image]
  107. # store reply
  108. if jsobj[2][i][9]:
  109. tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
  110. else:
  111. tmp += ['', '']
  112. tmp_dict = {}
  113. for i in range(len(db_columns)):
  114. tmp_dict[db_columns[i]] = tmp[i]
  115. tmp_dict['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
  116. result.append(tmp_dict)
  117. # write_to_file(orig,'debug.pickle')
  118. return result
  119. def save_js_to_db(jsobj, fid):
  120. global reviews_table
  121. global iddict
  122. for r in jsobj:
  123. r['fid'] = fid
  124. key = '{}_{}'.format(r['fid'], r['author_id'])
  125. if iddict.get(key) is not None:
  126. continue
  127. try:
  128. r['review_image'] = str(r['review_image'])
  129. reviews_table.insert(r)
  130. except:
  131. traceback.print_exc()
  132. def process_web_request(db, driver, fid):
  133. time.sleep(0.8)
  134. time.sleep(3)
  135. print("ppppppppp&**********************")
  136. for request in driver.requests:
  137. if request.response:
  138. # print(request.url)
  139. if 'listentitiesreviews?' in request.url :
  140. print('parsing js:')
  141. print(request.url)
  142. resp=request.response.body
  143. if 'gzip' in request.response.headers.get('Content-Encoding'):
  144. resp = gzip.decompress(request.response.body)
  145. if 'br' in request.response.headers.get('Content-Encoding'):
  146. resp = brotli.decompress(request.response.body)
  147. # resp = brotli.decompress(request.response.body)
  148. jstext = resp.decode('utf-8')
  149. result = parsing_js(jstext)
  150. save_js_to_db(result, fid)
  151. time.sleep(1)
  152. del driver.requests
  153. def page_down_(driver, xpath_css, time_):
  154. elmts = driver.find_elements_by_xpath(xpath_css)
  155. print(elmts)
  156. if len(elmts)>1:
  157. elmt=elmts[1]
  158. else:
  159. elmt=elmts[0]
  160. actions = ActionChains(driver)
  161. actions.move_to_element(elmt).click().perform()
  162. for i in range(time_):
  163. try:
  164. actions = ActionChains(driver)
  165. actions.send_keys(Keys.PAGE_DOWN).perform()
  166. except:
  167. traceback.print_exc()
  168. time.sleep(0.5)
  169. def get_reviews(driver, reviews_cnt):
  170. wait = WebDriverWait(driver, 30)
  171. more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
  172. wait.until(
  173. EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
  174. )
  175. element = driver.find_element_by_css_selector(more_reviews_css)
  176. driver.implicitly_wait(10)
  177. ActionChains(driver).move_to_element(element).click(element).perform()
  178. time.sleep(0.5)
  179. reviews_cnt = int(reviews_cnt)
  180. if reviews_cnt > 10:
  181. page_down_count = int(reviews_cnt) // 3
  182. page_down_(driver, '//div[@class="PPCwl"]', page_down_count)
  183. def main():
  184. global chrome_window
  185. global store_list_table
  186. global reviews_table
  187. global proxyport
  188. global iddict
  189. localip=socket.gethostbyname(socket.gethostname())
  190. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
  191. store_list_table = db['swire_store_list']
  192. reviews_table = db['reviews_table']
  193. iddict=build_cache(db)
  194. port=4444
  195. if len(sys.argv) == 3 :
  196. port=int(sys.argv[1])
  197. proxyport=int(sys.argv[2])
  198. if not chrome_window:
  199. print('restart docker pw{}'.format(port))
  200. # os.system('sudo docker container restart p'+str(port))
  201. os.system('sudo docker container restart pw'+str(port))
  202. time.sleep(10)
  203. print('drvier start...')
  204. driver = brower_start(port)
  205. job = get_next_job(db)
  206. for row, group in job.iterrows():
  207. try:
  208. item_url = group['item_url']
  209. reviews_cnt = group['reviews_cnt']
  210. fid = group['fid']
  211. print(reviews_cnt, item_url)
  212. driver.get(item_url)
  213. time.sleep(0.5)
  214. shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
  215. tmp_value = shop_soup.find('span', {'jsaction':'pane.rating.moreReviews'})
  216. if tmp_value:
  217. get_reviews(driver, reviews_cnt)
  218. process_web_request(db, driver, fid)
  219. print(driver.current_url)
  220. db['review_process'].insert({'fid':fid, 'dt':datetime.now()})
  221. except:
  222. traceback.print_exc()
  223. if __name__ == '__main__':
  224. main()