# run4.py
  1. # -*- coding: utf-8 -*-
  2. #from selenium import webdriver
  3. #from tkinter.tix import TEXT
  4. from seleniumwire import webdriver
  5. from selenium.webdriver.common.action_chains import ActionChains
  6. from selenium.webdriver.common.keys import Keys
  7. from selenium.webdriver.support import expected_conditions as EC
  8. from selenium.webdriver.support.wait import WebDriverWait
  9. from selenium.webdriver.common.by import By
  10. from selenium.common.exceptions import TimeoutException
  11. from selenium.common.exceptions import WebDriverException
  12. import selenium
  13. import traceback
  14. from bs4 import BeautifulSoup
  15. import gzip
  16. from utility import database_access as DA
  17. from utility.parseutils import *
  18. from utility.connect import *
  19. import redis
  20. from datetime import datetime
  21. from requests import session
  22. import pandas as pd
  23. import dataset
  24. import time
  25. import json
  26. import re
  27. import sys, os
  28. import socket
  29. import brotli
  30. import pickle
  31. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  32. import urllib.parse
  33. chrome_window=False
  34. globalkw=None
  35. proxyport=8787
  36. def write_to_file(jsobj,fname):
  37. with open(fname, 'wb') as handle:
  38. pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
  39. def build_cache(db):
  40. global reviews_table
  41. id_dict={}
  42. cursor = db.query('SELECT fid FROM google_poi.shop_list3;')
  43. for c in cursor:
  44. key = '{}'.format(c['fid'])
  45. id_dict[key]=1
  46. return id_dict
  47. #def brower_start(port):
  48. # global proxyport
  49. # global chrome_window
  50. # print(proxyport)
  51. # options = webdriver.ChromeOptions()
  52. # if chrome_window:
  53. # browser = webdriver.Chrome(
  54. # desired_capabilities=options.to_capabilities()
  55. # )
  56. # else:
  57. # chrome_options = webdriver.ChromeOptions()
  58. # chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport)) # Specify your Kubernetes service-name here
  59. # chrome_options.add_argument('--ignore-certificate-errors')
  60. # chrome_options.add_argument("--no-sandbox")
  61. # chrome_options.add_argument("--disable-dev-shm-usage")
  62. # browser = webdriver.Remote(
  63. # command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  64. # desired_capabilities=chrome_options.to_capabilities(),
  65. # seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
  66. # )
  67. # browser.set_window_size(1400,1000)
  68. # return browser
  69. def brower_start(port):
  70. options = webdriver.ChromeOptions()
  71. # browser = webdriver.Chrome(options=options)
  72. options.add_argument('--ignore-certificate-errors')
  73. options.add_argument("--no-sandbox")
  74. options.add_argument("--headless")
  75. options.add_argument("--disable-gpu")
  76. options.add_argument("--disable-dev-shm-usage")
  77. browser = webdriver.Chrome(options=options)
  78. browser.set_window_size(1400,1000)
  79. # browser = webdriver.Remote(
  80. # command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  81. # # command_executor='http://192.53.174.202:'+str(port)+'/wd/hub',
  82. # desired_capabilities=options.to_capabilities()
  83. # )
  84. return browser
  85. def get_next_job(db):
  86. result = {}
  87. # result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 100')
  88. # result = db.query('SELECT * FROM swire_store_list a WHERE not exists (select 1 from error_list2 tei where tei.fid = a.fid limit 1 ) ORDER BY RAND() limit 30')
  89. # result = db.query('SELECT * FROM swire_store_list a WHERE not exists (select 1 from shop_list3 tei where tei.fid = a.fid limit 1 ) ORDER BY RAND() limit 30')
  90. result = db.query('SELECT * FROM swire_store_list a WHERE fid not in (select fid from shop_list3 ) ORDER BY RAND() limit 30')
  91. url_pd = pd.DataFrame([dict(i) for i in result])
  92. url_pd['item_url'] = url_pd['place_id'].apply(lambda x: 'https://www.google.com/maps/place/?q=place_id:{}'.format(x) )
  93. # url_pd['fid'] = url_pd['item_url'].apply(lambda x: x.split('data=')[1].split('!')[3])
  94. # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
  95. # remove = db.query('select item_url from error_list3')
  96. # remove = pd.DataFrame([dict(i) for i in remove])
  97. # if len(remove) != 0:
  98. # remove_fid_list = remove['item_url'].to_list()
  99. # url_pd = url_pd[~url_pd['item_url'].isin(remove_fid_list)]
  100. return url_pd
def parsing_js(resp):
    """Parse a Google Maps 'place?' JSON payload into a flat shop dict.

    ``resp`` is the raw response text; the first five characters are the
    anti-hijacking prefix and are stripped before json.loads().  All the
    positional indexes into txt[6] are reverse-engineered from Google's
    undocumented internal payload — NOTE(review): they can break silently
    whenever Google changes the format.
    """
    txt = json.loads(resp[5::])
    output = {}
    output['name'] = txt[6][11]
    # NOTE: 'adress_name' (sic) is the key consumed downstream; do not rename.
    output['adress_name'] = txt[6][18]
    # Rating block: [7]=average rating, [8]=review count, [2]=price string.
    if txt[6][4]:
        if txt[6][4][7]:
            output['rating'] = str(txt[6][4][7])
        else:
            output['rating'] = None
        if txt[6][4][8]:
            output['user_ratings_total'] = str(txt[6][4][8])
        else:
            output['user_ratings_total'] = None
        if txt[6][4][2]:
            # Price level = count of '$' characters in the price string.
            output['price_level'] = str(len(['$' for i in txt[6][4][2] if i == '$']))
        else:
            output['price_level'] = None
    else:
        output['rating'] = None
        output['user_ratings_total'] = None
        output['price_level'] = None
    # Coordinates — NOTE(review): stored as lon=[...][1], lat=[...][2];
    # verify the payload order, it may actually be (lat, lon).
    if txt[6][37][0]:
        output['lon'] = txt[6][37][0][0][8][0][1]
        output['lat'] = txt[6][37][0][0][8][0][2]
    else:
        output['lon'] = None
        output['lat'] = None
    if txt[6][178]:
        output['tel'] = txt[6][178][0][0]
    else:
        output['tel'] = ''
    if txt[6][13]:
        output['category'] = txt[6][13][0]
    else:
        output['category'] = ''
    # City/area: taken from the last two space-separated tokens of the
    # locality string; bare except keeps any malformed payload non-fatal.
    try:
        location = txt[6][183][2][2][0]
        if location:
            location_s = location.split(' ')
            output['city'], output['area'] = location_s[-1], location_s[-2]
        else:
            output['city'], output['area'] = '', ''
    except:
        output['city'], output['area'] = '', ''
    # Amenity/introduction sections, whitelisted through ``intro_list``
    # (imported from utility.parseutils) — presumably maps a section
    # caption to (output_key, value_label); TODO confirm against parseutils.
    if txt[6][100]:
        for item in txt[6][100][1]:
            name = item[1]
            if name not in intro_list.keys(): continue
            name_map = intro_list[name]
            c = 0
            detail = []
            for t in item[2]:
                value = t[1]
                # t[3] == 1 marks a "not offered" entry.
                if t[3] == 1:
                    detail += [{'id':c, name_map[1]:'不提供'+str(value)}]
                else:
                    detail += [{'id':c, name_map[1]:value}]
                c += 1
            # Stored as the str() of a list of dicts, not as JSON.
            output[name_map[0]] = str(detail)
    # Ensure every intro_list output key exists, defaulting to '[]'.
    for key in intro_list:
        if intro_list[key][0] not in output.keys():
            output[intro_list[key][0]] = '[]'
    # Opening hours live in txt[6][34].
    if txt[6][34]:
        output = time_parsing_js(txt[6][34], output)
    else:
        output['open_now'] = 'False'
        output['periods'] = ''
        output['weekday_text'] = ''
        output['time_status'] = ''
    if txt[6][72]:
        output['header_image'] = txt[6][72][0][0][6][0]
    else:
        output['header_image'] = ''
    # google_url plus its 'ludocid' query parameter (used later to match
    # the review XHRs).
    if txt[6][126]:
        output['google_url'] = txt[6][126][4]
        ludocid_str = [i for i in txt[6][126][4].split('&') if i.find('ludocid') != -1]
        if len(ludocid_str) != 0:
            ludocid = ludocid_str[0].split('=')[-1]
            output['ludocid'] = ludocid
    else:
        # NOTE(review): in this branch 'ludocid' is never set, yet main()
        # reads output['ludocid'] later — possible KeyError.
        output['google_url'] = ''
    # write_to_file(orig,'debug.pickle')
    return output
def time_parsing_js(time_json, output):
    """Fill opening-hours fields on ``output`` from the txt[6][34] block.

    Sets 'open_now', 'periods' (Places-API-like open/close records),
    'weekday_text' and 'time_status'; returns the mutated dict.
    ``week_list`` and ``blank_check`` come from utility.parseutils;
    week_list presumably maps a Chinese weekday label to a day number —
    TODO confirm.
    """
    weekday_text = []
    periods = []
    # Missing or empty schedule -> blank fields, treated as closed.
    if time_json is None:
        output['open_now'] = 'False'
        output['periods'] = ''
        output['weekday_text'] = ''
        output['time_status'] = ''
        return output
    if time_json[1] is None:
        output['open_now'] = 'False'
        output['periods'] = ''
        output['weekday_text'] = ''
        output['time_status'] = ''
        return output
    for time_ in time_json[1]:
        # time_[0] = weekday label, time_[1] = list of time-range strings.
        week = time_[0]
        weekday_text += ['{}: {}'.format(week, ', '.join(time_[1]))]
        for t in time_[1]:
            if t == '24 小時營業':  # "open 24 hours"
                periods += [{
                    "open":{
                        "day": week_list[week],
                        "time": '0000'
                    },
                    "close":{
                        "day": week_list[week],
                        "time": ''
                    }
                }]
            elif t == '休息':  # "closed" for the whole day
                periods += [{
                    "open":{
                        "day": week_list[week],
                        "time": ''
                    },
                    "close":{
                        "day": week_list[week],
                        "time": ''
                    }
                }]
            else:
                # Range like '09:00–17:00' (en-dash separator).
                start, end = t.split('–')
                end_hour, end_min = end.split(':')
                start_hour, start_min = start.split(':')
                # NOTE(review): hours compared as *strings*; only correct if
                # they are zero-padded ('09' < '17') — confirm payload format.
                if end_hour < start_hour:
                    # Closing after midnight rolls over to the next day.
                    end_day = week_list[week] + 1
                else:
                    end_day = week_list[week]
                periods += [{
                    "open":{
                        "day": week_list[week],
                        "time": start.replace(':','')
                    },
                    "close":{
                        "day": end_day,
                        "time": end.replace(':','')
                    }
                }]
    # Stored as str() of Python structures, matching the rest of the row.
    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    # Status text before the '⋅' separator, e.g. "open now" / closure notes.
    output['time_status'] = blank_check(time_json[4][4].split('⋅')[0])
    # Permanently-closed / temporarily-closed markers force open_now False.
    if output['time_status'].find('永久停業') != -1 or\
       output['time_status'].find('暫時關閉') != -1 or\
       output['time_status'].find('暫停營業') != -1:
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'
    return output
  254. def save_js_to_db(jsobj, fid):
  255. global shop_table
  256. global iddict
  257. jsobj['fid'] = fid
  258. if iddict.get(fid) is None:
  259. try:
  260. shop_table.insert(jsobj)
  261. except:
  262. traceback.print_exc()
  263. def process_web_request_start(driver, fid):
  264. time.sleep(3)
  265. print("start&**********************")
  266. for request in driver.requests:
  267. if request.response:
  268. # print(request.url)
  269. if 'place?' in request.url :
  270. # print('parsing js:')
  271. front, _ = fid.split(':')
  272. if request.url.find(front) != -1:
  273. print(request.url)
  274. # resp = brotli.decompress(request.response.body)
  275. resp=request.response.body
  276. if 'gzip' in request.response.headers.get('Content-Encoding'):
  277. resp = gzip.decompress(request.response.body)
  278. if 'br' in request.response.headers.get('Content-Encoding'):
  279. resp = brotli.decompress(request.response.body)
  280. # resp = brotli.decompress(request.response.body)
  281. jstext = resp.decode('utf-8')
  282. output = parsing_js(jstext)
  283. time.sleep(1)
  284. return output
  285. return 0
  286. def reviews_parsing_js(resp):
  287. columns_name = ['id','author_page','author_name', 'profile_photo_url', 'author_review_count',
  288. 'created_at', 'text', 'photos', 'rating', 'store_review_time','store_review']
  289. jsobj = json.loads(resp[5::])
  290. result = []
  291. for i in range(len(jsobj[2])):
  292. tmp = []
  293. tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
  294. tmp += [jsobj[2][i][1], jsobj[2][i][3]]
  295. # image
  296. image = []
  297. if jsobj[2][i][14]:
  298. for j in range(len(jsobj[2][i][14])):
  299. image += [jsobj[2][i][14][j][6][0]]
  300. tmp += [image]
  301. #rating
  302. tmp += [jsobj[2][i][4]]
  303. # store reply
  304. if jsobj[2][i][9]:
  305. tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
  306. else:
  307. tmp += ['', '']
  308. result.append(list(map(lambda x, y: {x:y}, columns_name, tmp)))
  309. return result
  310. def process_web_request_reviews(driver, output, ludocid):
  311. time.sleep(3)
  312. print("reviews&**********************")
  313. for request in driver.requests:
  314. if request.response:
  315. # print(request.url)
  316. if 'listentitiesreviews?' in request.url :
  317. # print('parsing js:')
  318. if request.url.find(ludocid) != -1:
  319. print(request.url)
  320. # resp = brotli.decompress(request.response.body)
  321. # jstext = resp.decode('utf-8')
  322. # result = reviews_parsing_js(jstext)
  323. # resp = brotli.decompress(request.response.body)
  324. resp=request.response.body
  325. if 'gzip' in request.response.headers.get('Content-Encoding'):
  326. resp = gzip.decompress(request.response.body)
  327. if 'br' in request.response.headers.get('Content-Encoding'):
  328. resp = brotli.decompress(request.response.body)
  329. # resp = brotli.decompress(request.response.body)
  330. jstext = resp.decode('utf-8')
  331. result = parsing_js(jstext)
  332. output['reviews'] = str(result)
  333. time.sleep(1)
  334. return output
  335. return 0
  336. def photos_parsing_js(resp):
  337. def image_url_change_size(url):
  338. if url.find('streetviewpixels') != -1:
  339. return url
  340. else:
  341. url_split = url.split('=')
  342. new_url = url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2::])
  343. return new_url
  344. jsobj = json.loads(resp[5::])
  345. # write_to_file(jsobj,'tmp/debug_{}.pickle'.format(c))
  346. menu = []
  347. all = []
  348. photo_category_map = {}
  349. for row in jsobj[12][0]:
  350. photo_category_map[row[0]] = row[2]
  351. if photo_category_map[jsobj[13][0]] == '全部':
  352. for img in jsobj[0]:
  353. all += [image_url_change_size(img[6][0])]
  354. elif photo_category_map[jsobj[13][0]] == '菜單':
  355. for img in jsobj[0]:
  356. menu += [image_url_change_size(img[6][0])]
  357. return list(set(menu)), list(set(all))
  358. def process_web_request_photo(driver, output, fid):
  359. try:
  360. driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
  361. photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
  362. tab_dict = {}
  363. for tab_index in [0, 1, 2]:
  364. selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
  365. if len(selector) != 0:
  366. photo_name = selector[0].text
  367. if photo_name == '菜單':
  368. tab_dict[photo_name] = tab_index
  369. elif photo_name == '全部':
  370. tab_dict[photo_name] = tab_index
  371. except:
  372. tab_dict = {}
  373. print(tab_dict)
  374. for tab_ in tab_dict:
  375. tab_index = tab_dict[tab_]
  376. print(tab_index)
  377. wait = WebDriverWait(driver, 60)
  378. wait.until(
  379. EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
  380. )
  381. element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
  382. ActionChains(driver).move_to_element(element).click(element).perform()
  383. time.sleep(1)
  384. print("photo&**********************")
  385. menu_list = []
  386. all_list = []
  387. for request in driver.requests:
  388. if request.response:
  389. # print(request.url)
  390. if 'photo?' in request.url :
  391. # print('parsing js:')
  392. front, _ = fid.split(':')
  393. if request.url.find(front) != -1:
  394. # resp = brotli.decompress(request.response.body)
  395. print(request.url)
  396. resp=request.response.body
  397. if 'gzip' in request.response.headers.get('Content-Encoding'):
  398. resp = gzip.decompress(request.response.body)
  399. if 'br' in request.response.headers.get('Content-Encoding'):
  400. resp = brotli.decompress(request.response.body)
  401. jstext = resp.decode('utf-8')
  402. menu, all = photos_parsing_js(jstext)
  403. menu_list += menu
  404. all_list += all
  405. output['shop_photo'] = str(all_list[:5])
  406. output['menu_photo'] = str(menu_list[:5])
  407. return output
def main():
    """Crawl loop: pick uncrawled stores, scrape place info, reviews and
    photos with a headless Chrome, and persist the result to shop_list3.
    Errors are reported to redis and to the error_list3 table."""
    global chrome_window
    global store_list_table
    global shop_table
    global proxyport
    global iddict
    # NOTE(review): localip is computed but never used.
    localip=socket.gethostbyname(socket.gethostname())
    # NOTE(review): hard-coded DB credentials in source; move to config/env.
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    shop_table = db['shop_list3']
    error_table = db['error_list2']
    # Cache of already-crawled fids so save_js_to_db() can skip duplicates.
    iddict=build_cache(db)
    print("iddict...{}".format(datetime.now()))
    port=4444
    # Optional CLI override: run4.py <webdriver-port> <proxy-port>
    if len(sys.argv) == 3 :
        port=int(sys.argv[1])
        proxyport=int(sys.argv[2])
    if not chrome_window:
        print('restart docker pw{}'.format(port))
        # os.system('sudo docker container restart p'+str(port))
        # os.system('sudo docker container restart pw'+str(port))
        # time.sleep(10)
    print('drvier start...')
    driver = brower_start(port)
    job = get_next_job(db)
    c = 0
    for row, group in job.iterrows():
        try:
            item_url = group['item_url']
            name = group['name']
            num = group['num']
            keyword = group['keyword']
            fid = group['fid']
            if name:
                db_name = name
            else:
                db_name = num
            print(fid, keyword, db_name)
            print(item_url)
            #shop_info
            print('parsing shop info....')
            # Up to 5 attempts: load the page, click element 'sb_cb50' and
            # navigate back so selenium-wire captures the 'place?' XHR.
            for i in range(5):
                print('shop info try...{}'.format(i))
                print("shop info try...{}".format(datetime.now()))
                driver.get(item_url)
                time.sleep(3)
                wait = WebDriverWait(driver, 10)
                wait.until(
                    EC.element_to_be_clickable((By.ID, 'sb_cb50'))
                )
                # NOTE(review): find_element_by_id is removed in Selenium 4;
                # the rest of the file already uses find_element(By...).
                element = driver.find_element_by_id('sb_cb50')
                driver.implicitly_wait(9)
                ActionChains(driver).move_to_element(element).click(element).perform()
                time.sleep(1)
                driver.back()
                # Navigation did not happen -> retry.
                if driver.current_url == item_url:continue
                print(driver.current_url)
                try:
                    output = process_web_request_start(driver, fid)
                    if output != 0: break
                except:
                    # Best-effort crash report to redis.
                    r = redis.Redis(host='db.ptt.cx', port=6379, db=1,password='choozmo9')
                    msg=traceback.format_exc()
                    r.set('google_error',msg)
            # NOTE(review): if all 5 attempts fail, ``output`` may be 0 or
            # unbound below; the resulting exception is caught by the
            # outer handler.
            # reivews
            print('parsing reviews....')
            print("parsing reviews.....{}".format(datetime.now()))
            if not output['user_ratings_total']:
                output['reviews'] = ''
            else:
                # Up to 3 attempts to open the "more reviews" pane so the
                # listentitiesreviews XHR gets captured.
                for i in range(3):
                    print('reviews try...{}'.format(i))
                    print("reviews try.....{}".format(datetime.now()))
                    try:
                        wait = WebDriverWait(driver, 30)
                        more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
                        wait.until(
                            EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
                        )
                        element = driver.find_element_by_css_selector(more_reviews_css)
                        driver.implicitly_wait(10)
                        ActionChains(driver).move_to_element(element).click(element).perform()
                        time.sleep(0.5)
                        # NOTE(review): output['ludocid'] is only set when a
                        # google_url was parsed — may KeyError otherwise.
                        output_ = process_web_request_reviews(driver, output, output['ludocid'])
                        if output_ != 0:
                            output = output_
                            break
                    except:
                        # Reload the place page and retry.
                        driver.get(item_url)
                        time.sleep(0.5)
            # if 'reviews' not in output.keys():
            #     continue
            # photo
            print('parsing photo....')
            if output['header_image'] != '':
                # Up to 3 attempts to open the photo gallery.
                for i in range(3):
                    print('photo try...{}'.format(i))
                    print("photo try......{}".format(datetime.now()))
                    driver.get(item_url)
                    time.sleep(0.5)
                    print(driver.current_url)
                    try:
                        wait = WebDriverWait(driver, 30)
                        wait.until(
                            EC.element_to_be_clickable((By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name'])))
                        )
                        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
                        ActionChains(driver).move_to_element(element).click(element).perform()
                        output = process_web_request_photo(driver, output, fid)
                        break
                    except:
                        pass
            else:
                # No header image -> assume no photo gallery.
                output['shop_photo'] = '[]'
                output['menu_photo'] = '[]'
            output['item_url'] = item_url
            output['keyword'] = keyword
            # Fall back to a plain Google search URL built from the address.
            if output['google_url'] == '':
                query_name = output['adress_name'].replace('(','').replace(')', '').replace(' ','')
                output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
            output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
            print(output)
            save_js_to_db(output, fid)
            print("save_js_to_db......{}".format(datetime.now()))
            # Mark this URL as successfully processed.
            error_table.upsert({'item_url':item_url,'check_':1},['item_url'])
            print('*'*10)
        except TimeoutException as e:
            traceback.print_exc()
            break
        # NOTE(review): this second TimeoutException handler is unreachable
        # (identical to the one above).
        except TimeoutException as e:
            traceback.print_exc()
            break
        except:
            # Record the failure both in redis and in error_list3.
            r = redis.Redis(host='db.ptt.cx', port=6379, db=1,password='choozmo9')
            msg=traceback.format_exc()
            r.set('google_error',msg)
            error_table3 = db['error_list3']
            error_table3.insert({'name':name,'keyword':keyword,'item_url':item_url,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
            traceback.print_exc()
            # sys.exit()
# Script entry point.
if __name__ == '__main__':
    main()