# run4.py
# -*- coding: utf-8 -*-
# from selenium import webdriver
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
import selenium
import traceback
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
from requests import session
import pandas as pd
import dataset
import time
import json
import re
import sys, os
import socket
import brotli
import pickle
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import urllib.parse

chrome_window = False
globalkw = None
proxyport = 8787

db_columns = ['id', 'author_page', 'author_name', 'profile_photo_url', 'author_review_count',
              'created_at', 'text', 'photos', 'store_review_time', 'store_review']


def write_to_file(jsobj, fname):
    with open(fname, 'wb') as handle:
        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # import codecs
    # fw = codecs.open(fname, 'w', 'utf-8')
    # fw.write(str(jsobj))
    # fw.close()
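
# A minimal counterpart for loading those debug pickles back (illustrative
# helper, not part of the original pipeline; the filename is whatever was
# passed to write_to_file):
def read_from_file(fname):
    with open(fname, 'rb') as handle:
        return pickle.load(handle)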


def build_cache(db):
    # Cache every (fid, author_id) pair already stored so review inserts
    # can be de-duplicated later (see save_js_to_db).
    id_dict = {}
    cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
    for c in cursor:
        key = '{}_{}'.format(c['fid'], c['author_id'])
        id_dict[key] = 1
    return id_dict


def browser_start(port):
    global proxyport
    global chrome_window
    print(proxyport)
    options = webdriver.ChromeOptions()
    if chrome_window:
        # Local Chrome with a visible window (debugging mode).
        browser = webdriver.Chrome(
            desired_capabilities=options.to_capabilities()
        )
    else:
        # Remote Chrome on a Selenium Grid, routed through the selenium-wire proxy.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--proxy-server=host.docker.internal:' + str(proxyport))  # specify your Kubernetes service name here
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
            desired_capabilities=chrome_options.to_capabilities(),
            seleniumwire_options={'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False}
        )
    browser.set_window_size(1400, 1000)
    return browser
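
# Topology note: main() restarts a Docker container named 'pw<port>', which is
# assumed to host the Selenium Grid reached at 127.0.0.1:<port>/wd/hub, while
# Chrome inside that container reaches this script's selenium-wire proxy back
# through host.docker.internal:<proxyport>.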


def get_next_job(db):
    # Pull a small random batch of failed items to retry; each row is expected
    # to carry item_url, name, num and keyword columns (used in main below).
    result = db.query('SELECT * FROM error_list2 ORDER BY RAND() LIMIT 2')
    url_pd = pd.DataFrame([dict(i) for i in result])
    # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
    # remove = db.query('select fid from review_process')
    # remove = pd.DataFrame([dict(i) for i in remove])
    # remove_fid_list = remove['fid'].to_list()
    # url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]
    return url_pd
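
# The XHR endpoints captured below (place?, listentitiesreviews?, photo?)
# return JSON guarded by Google's ")]}'\n" anti-hijacking prefix, which is why
# every parser slices off the first five characters before json.loads.
# A minimal sketch of that step on a made-up payload:
#
#     raw = ")]}'\n[1, 2, 3]"
#     data = json.loads(raw[5:])   # -> [1, 2, 3]
#
# The numeric field positions used below (txt[6][11], txt[6][4][7], ...) are
# reverse-engineered from Google Maps zh-TW responses and can shift without
# notice.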


def parsing_js(resp, db_name):
    txt = json.loads(resp[5:])
    output = {}
    if txt[6][11] != db_name:
        return 0
    output['name'] = txt[6][11]
    output['adress_name'] = txt[6][18]  # key name kept as-is; downstream code reads output['adress_name']
    if txt[6][4]:
        if txt[6][4][7]:
            output['rating'] = str(txt[6][4][7])
        else:
            output['rating'] = ''
        if txt[6][4][8]:
            output['user_ratings_total'] = str(txt[6][4][8])
        else:
            output['user_ratings_total'] = ''
        if txt[6][4][2]:
            # Price level is encoded as a string of '$' signs; count them.
            output['price_level'] = str(len(['$' for i in txt[6][4][2] if i == '$']))
        else:
            output['price_level'] = ''
    else:
        output['rating'] = ''
        output['user_ratings_total'] = ''
        output['price_level'] = ''
    if txt[6][37][0]:
        output['lon'] = txt[6][37][0][0][8][0][1]
        output['lat'] = txt[6][37][0][0][8][0][2]
    else:
        output['lon'] = None
        output['lat'] = None
    if txt[6][178]:
        output['tel'] = txt[6][178][0][0]
    else:
        output['tel'] = ''
    if txt[6][13]:
        output['category'] = txt[6][13][0]
    else:
        output['category'] = ''
    try:
        location = txt[6][183][2][2][0]
        if location:
            location_s = location.split(' ')
            output['city'], output['area'] = location_s[-1], location_s[-2]
        else:
            output['city'], output['area'] = '', ''
    except Exception:
        output['city'], output['area'] = '', ''
    if txt[6][100]:
        for item in txt[6][100][1]:
            name = item[1]
            if name not in intro_list.keys():
                continue
            name_map = intro_list[name]
            c = 0
            detail = []
            for t in item[2]:
                value = t[1]
                if t[3] == 1:
                    # t[3] == 1 marks an attribute the shop does NOT offer;
                    # '不提供' means 'not provided'.
                    detail += [{'id': c, name_map[1]: '不提供' + str(value)}]
                else:
                    detail += [{'id': c, name_map[1]: value}]
                c += 1
            output[name_map[0]] = str(detail)
    # Fill empty defaults for any attribute column the response did not carry.
    for key in intro_list:
        if intro_list[key][0] not in output.keys():
            output[intro_list[key][0]] = '[]'
    if txt[6][34]:
        output = time_parsing_js(txt[6][34], output)
    else:
        output['open_now'] = 'False'
        output['periods'] = ''
        output['weekday_text'] = ''
        output['time_status'] = ''
    if txt[6][72]:
        output['header_image'] = txt[6][72][0][0][6][0]
    else:
        output['header_image'] = ''
    print(output)
    # write_to_file(orig, 'debug.pickle')
    return output
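
# intro_list, week_list and blank_check come from utility.parseutils (star
# import above). Judging from usage, intro_list presumably maps a zh-TW
# attribute heading to an (output_column, value_key) pair, e.g. (illustrative
# only, not the real table):
#     intro_list = {'服務項目': ('services', 'item'), ...}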


def time_parsing_js(time_json, output):
    weekday_text = []
    periods = []
    for time_ in time_json[1]:
        week = time_[0]
        weekday_text += ['{}: {}'.format(week, ', '.join(time_[1]))]
        for t in time_[1]:
            if t == '24 小時營業':  # 'open 24 hours'
                periods += [{
                    "open": {
                        "day": week_list[week],
                        "time": '0000'
                    },
                    "close": {
                        "day": week_list[week],
                        "time": ''
                    }
                }]
            elif t == '休息':  # 'closed'
                periods += [{
                    "open": {
                        "day": week_list[week],
                        "time": ''
                    },
                    "close": {
                        "day": week_list[week],
                        "time": ''
                    }
                }]
            else:
                start, end = t.split('–')
                end_hour, end_min = end.split(':')
                start_hour, start_min = start.split(':')
                # Compare hours numerically; a closing hour before the opening
                # hour means the slot runs past midnight into the next day.
                if int(end_hour) < int(start_hour):
                    end_day = week_list[week] + 1
                else:
                    end_day = week_list[week]
                periods += [{
                    "open": {
                        "day": week_list[week],
                        "time": start.replace(':', '')
                    },
                    "close": {
                        "day": end_day,
                        "time": end.replace(':', '')
                    }
                }]
    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    output['time_status'] = blank_check(time_json[4][4].split('⋅')[0])
    # '永久停業' = permanently closed; '暫時關閉' / '暫停營業' = temporarily closed.
    if output['time_status'].find('永久停業') != -1 or \
       output['time_status'].find('暫時關閉') != -1 or \
       output['time_status'].find('暫停營業') != -1:
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'
    return output
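
# Illustrative output of time_parsing_js, assuming week_list follows the
# Places-style convention ('星期一' -> 1): a Monday '09:00–18:00' slot becomes
#     {'open': {'day': 1, 'time': '0900'}, 'close': {'day': 1, 'time': '1800'}}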


def save_js_to_db(jsobj, fid):
    global reviews_table
    global iddict
    for r in jsobj:
        r['fid'] = fid
        key = '{}_{}'.format(r['fid'], r['author_id'])
        if iddict.get(key) is not None:
            # Already in the table; skip the duplicate review.
            continue
        try:
            r['review_image'] = str(r['review_image'])
            reviews_table.insert(r)
        except Exception:
            traceback.print_exc()
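
# The process_web_request_* helpers below call brotli.decompress directly,
# assuming Google serves these bodies with Content-Encoding: br. A more
# defensive variant would check the header first (hypothetical helper, not
# used by the original script):
def decode_body(response):
    # Fall back to the raw body when the response is not brotli-compressed.
    if response.headers.get('content-encoding') == 'br':
        return brotli.decompress(response.body).decode('utf-8')
    return response.body.decode('utf-8')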


def process_web_request_start(driver, db_name):
    time.sleep(5)
    print("start&**********************")
    for request in driver.requests:
        if request.response:
            # print(request.url)
            if 'place?' in request.url:
                print('parsing js:')
                print(request.url)
                resp = brotli.decompress(request.response.body)
                jstext = resp.decode('utf-8')
                output = parsing_js(jstext, db_name)
                time.sleep(1)
                return output
    return 0


def reviews_parsing_js(resp):
    columns_name = ['author_id', 'author_page', 'author_name', 'author_image', 'author_review_count',
                    'review_time', 'review_content', 'review_image',
                    'rating', 'store_review_time', 'store_review']
    jsobj = json.loads(resp[5:])
    result = []
    for i in range(len(jsobj[2])):
        tmp = []
        tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
        tmp += [jsobj[2][i][1], jsobj[2][i][3]]
        # image
        image = []
        if jsobj[2][i][14]:
            for j in range(len(jsobj[2][i][14])):
                image += [jsobj[2][i][14][j][6][0]]
        tmp += [image]
        # rating
        tmp += [jsobj[2][i][4]]
        # store reply
        if jsobj[2][i][9]:
            tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
        else:
            tmp += ['', '']
        # Zip the column names with the collected values into single-key dicts.
        result.append(list(map(lambda x, y: {x: y}, columns_name, tmp)))
    return result
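
# Each parsed review is a list of single-key dicts, e.g. (values illustrative):
#     [{'author_id': '105...'}, {'author_page': 'https://...'}, ...]
# Downstream code stores str(result) in the 'reviews' column, so the exact
# shape only has to survive a round trip through str().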


def process_web_request_reviews(driver, output):
    time.sleep(3)
    print("reviews&**********************")
    for request in driver.requests:
        if request.response:
            # print(request.url)
            if 'listentitiesreviews?' in request.url:
                print('parsing js:')
                print(request.url)
                resp = brotli.decompress(request.response.body)
                jstext = resp.decode('utf-8')
                result = reviews_parsing_js(jstext)
                output['reviews'] = str(result)
                time.sleep(1)
                return output
    # No reviews request was captured; hand back the output unchanged.
    return output


def photos_parsing_js(resp, c):
    def image_url_change_size(url):
        # Replace the size directive in a googleusercontent URL with s600.
        url_split = url.split('=')
        new_url = url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2:])
        return new_url

    jsobj = json.loads(resp[5:])
    # write_to_file(jsobj, 'tmp/debug_{}.pickle'.format(c))
    menu = []
    all_photos = []  # renamed from 'all' to avoid shadowing the builtin
    if jsobj[10] == 0:
        for img in jsobj[0]:
            all_photos += [image_url_change_size(img[6][0])]
    else:
        for img in jsobj[0]:
            menu += [image_url_change_size(img[6][0])]
    return menu, all_photos
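
# image_url_change_size swaps the size directive of a googleusercontent URL
# for s600, e.g. (illustrative URL):
#     https://lh5.googleusercontent.com/p/XXX=w203-h114-k-no
#  -> https://lh5.googleusercontent.com/p/XXX=s600-k-no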


def process_web_request_photo(driver, output):
    try:
        driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
        photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
        tab_dict = {}
        for tab_index in [0, 1, 2]:
            selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
            if len(selector) != 0:
                photo_name = selector[0].text
                # Only the '菜單' (menu) and '全部' (all) photo tabs are scraped.
                if photo_name == '菜單':
                    tab_dict[photo_name] = tab_index
                elif photo_name == '全部':
                    tab_dict[photo_name] = tab_index
    except Exception:
        tab_dict = {}
    print(tab_dict)
    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        time.sleep(2)
    print("photo&**********************")
    menu_list = []
    all_list = []
    c = 0  # moved out of the loop so the debug counter actually increments
    for request in driver.requests:
        if request.response:
            # print(request.url)
            if 'photo?' in request.url:
                print('parsing js:')
                print(request.url)
                resp = brotli.decompress(request.response.body)
                jstext = resp.decode('utf-8')
                menu, all_photos = photos_parsing_js(jstext, c)
                menu_list += menu
                all_list += all_photos
                c += 1
    output['shop_photo'] = str(all_list)
    output['menu_photo'] = str(menu_list)
    return output


def main():
    global chrome_window
    global store_list_table
    global reviews_table
    global proxyport
    global iddict
    localip = socket.gethostbyname(socket.gethostname())
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    shop_table = db['shop_list4']
    iddict = build_cache(db)
    port = 4444
    if len(sys.argv) == 3:
        port = int(sys.argv[1])
        proxyport = int(sys.argv[2])
    if not chrome_window:
        print('restart docker pw{}'.format(port))
        # os.system('sudo docker container restart p' + str(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(10)
    print('driver start...')
    driver = browser_start(port)
    job = get_next_job(db)
    for row, group in job.iterrows():
        try:
            item_url = group['item_url']
            name = group['name']
            num = group['num']
            keyword = group['keyword']
            if name:
                db_name = name
            else:
                db_name = num
            print(name, num, keyword, db_name)
            print(item_url)
            # shop info
            print('parsing shop info....')
            output = 0
            for i in range(5):
                print('shop info try...{}'.format(i))
                driver.get(item_url)
                time.sleep(2)
                element = driver.find_element(By.ID, 'searchbox-searchbutton')
                driver.implicitly_wait(10)
                ActionChains(driver).move_to_element(element).click(element).perform()
                time.sleep(5)
                driver.back()
                if driver.current_url == item_url:
                    continue
                print(driver.current_url)
                output = process_web_request_start(driver, db_name)
                if output != 0:
                    break
            if output == 0:
                # No place? response captured after five tries; skip this job.
                continue
            # reviews
            print('parsing reviews....')
            if output['user_ratings_total'] == '':
                output['reviews'] = ''
            else:
                for i in range(3):
                    print('reviews try...{}'.format(i))
                    try:
                        wait = WebDriverWait(driver, 30)
                        more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
                        wait.until(
                            EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
                        )
                        element = driver.find_element(By.CSS_SELECTOR, more_reviews_css)
                        driver.implicitly_wait(10)
                        ActionChains(driver).move_to_element(element).click(element).perform()
                        time.sleep(0.5)
                        output = process_web_request_reviews(driver, output)
                        break
                    except Exception:
                        driver.get(item_url)
                        time.sleep(0.5)
            # photo
            print('parsing photo....')
            if output['header_image'] != '':
                for i in range(3):
                    print('photo try...{}'.format(i))
                    driver.get(item_url)
                    time.sleep(0.5)
                    print(driver.current_url)
                    try:
                        wait = WebDriverWait(driver, 30)
                        # "{}的相片" is the zh-TW UI label for "photos of <name>".
                        wait.until(
                            EC.element_to_be_clickable((By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name'])))
                        )
                        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
                        ActionChains(driver).move_to_element(element).click(element).perform()
                        output = process_web_request_photo(driver, output)
                        break
                    except Exception:
                        pass
            else:
                output['shop_photo'] = '[]'
                output['menu_photo'] = '[]'
            print(output)
            query_name = output['adress_name'].replace('(', '').replace(')', '').replace(' ', '')
            output['item_url'] = item_url
            output['keyword'] = keyword
            output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
            output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
            # Keyed on item_url so a retried job updates the existing row.
            shop_table.upsert(output, ['item_url'])
        except Exception:
            traceback.print_exc()


if __name__ == '__main__':
    main()
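
# Usage sketch: with no arguments the script talks to a Grid hub on port 4444
# and runs its selenium-wire proxy on 8787; both can be overridden, e.g.:
#     python run4.py 4445 8788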