# run5.py
# -*- coding: utf-8 -*-
#from selenium import webdriver
#from tkinter.tix import TEXT
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
import selenium
import traceback
from bs4 import BeautifulSoup
import gzip
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
import redis
from datetime import datetime
from requests import session
import pandas as pd
import dataset
import time
import json
import re
import sys, os
import socket
import brotli
import pickle
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import urllib.parse
chrome_window = False
globalkw = None
proxyport = 8787
def write_to_file(jsobj, fname):
    with open(fname, 'wb') as handle:
        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
def build_cache(db):
    # Map every fid already stored in shop_list4 so crawled stores can be skipped.
    id_dict = {}
    cursor = db.query('SELECT fid FROM google_poi.shop_list4;')
    for c in cursor:
        key = '{}'.format(c['fid'])
        id_dict[key] = 1
    return id_dict
def brower_start(port):
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument("--no-sandbox")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    browser = webdriver.Chrome(options=options)
    browser.set_window_size(1400, 1000)
    # browser = webdriver.Remote(
    #     command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
    #     # command_executor='http://192.53.174.202:'+str(port)+'/wd/hub',
    #     desired_capabilities=options.to_capabilities()
    # )
    return browser
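
# Note: seleniumwire's webdriver records every HTTP request the browser makes
# in driver.requests, and `del driver.requests` clears that capture buffer.
# The process_web_request_* helpers below read Google's XHR responses out of
# this buffer instead of scraping the rendered DOM.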
def get_next_job(db):
    # CSV of HKS stores flagged for re-crawling; columns: '分店編號' (branch id),
    # '分店' (branch name), '地址' (address).
    location_list = pd.read_csv('HKS須重爬店家.csv')
    result = db.query('SELECT * FROM progress_list WHERE check_ = 1')
    url_pd = pd.DataFrame([dict(i) for i in result])
    # Drop branches already crawled successfully, then sample a batch of 500.
    location_list = location_list[~location_list['分店編號'].isin(url_pd['id_'].to_list())]
    location_list = location_list.sample(500)
    return location_list
def parsing_js(resp):
    # Google prefixes the JSON payload with ")]}'\n"; strip the first five characters.
    txt = json.loads(resp[5:])
    output = {}
    output['name'] = txt[6][11]
    output['adress_name'] = txt[6][18]  # key spelling kept as-is; presumably matches the shop_list4 column
    output['fid'] = txt[6][10]
    if txt[6][4]:
        if txt[6][4][7]:
            output['rating'] = str(txt[6][4][7])
        else:
            output['rating'] = None
        if txt[6][4][8]:
            output['user_ratings_total'] = str(txt[6][4][8])
        else:
            output['user_ratings_total'] = None
        if txt[6][4][2]:
            # Price level is the number of '$' signs in the price field.
            output['price_level'] = str(txt[6][4][2].count('$'))
        else:
            output['price_level'] = None
    else:
        output['rating'] = None
        output['user_ratings_total'] = None
        output['price_level'] = None
    if txt[6][37][0]:
        output['lon'] = txt[6][37][0][0][8][0][1]
        output['lat'] = txt[6][37][0][0][8][0][2]
    else:
        output['lon'] = None
        output['lat'] = None
    if txt[6][178]:
        output['tel'] = txt[6][178][0][0]
    else:
        output['tel'] = ''
    if txt[6][13]:
        output['category'] = txt[6][13][0]
    else:
        output['category'] = ''
    try:
        location = txt[6][183][2][2][0]
        if location:
            location_s = location.split(' ')
            output['city'], output['area'] = location_s[-1], location_s[-2]
        else:
            output['city'], output['area'] = '', ''
    except:
        output['city'], output['area'] = '', ''
    # Attribute sections (service options etc.); intro_list comes from utility.parseutils.
    if txt[6][100]:
        for item in txt[6][100][1]:
            name = item[1]
            if name not in intro_list.keys(): continue
            name_map = intro_list[name]
            c = 0
            detail = []
            for t in item[2]:
                value = t[1]
                if t[3] == 1:
                    # '不提供' = "not offered"
                    detail += [{'id': c, name_map[1]: '不提供' + str(value)}]
                else:
                    detail += [{'id': c, name_map[1]: value}]
                c += 1
            output[name_map[0]] = str(detail)
    for key in intro_list:
        if intro_list[key][0] not in output.keys():
            output[intro_list[key][0]] = '[]'
    if txt[6][34]:
        output = time_parsing_js(txt[6][34], output)
    else:
        output['open_now'] = 'False'
        output['periods'] = ''
        output['weekday_text'] = ''
        output['time_status'] = ''
    if txt[6][72]:
        output['header_image'] = txt[6][72][0][0][6][0]
    else:
        output['header_image'] = ''
    if txt[6][126]:
        output['google_url'] = txt[6][126][4]
        ludocid_str = [i for i in txt[6][126][4].split('&') if i.find('ludocid') != -1]
        if len(ludocid_str) != 0:
            ludocid = ludocid_str[0].split('=')[-1]
            output['ludocid'] = ludocid
    else:
        output['google_url'] = ''
    # write_to_file(output, 'debug.pickle')
    return output
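
# A minimal sketch of the payload shape parsing_js expects (indices inferred
# from the accesses above; Google's pb responses are untyped nested lists and
# may change without notice):
#   txt[6][11]  -> name           txt[6][18]  -> address
#   txt[6][10]  -> fid            txt[6][4]   -> rating / review count / price
#   txt[6][37]  -> coordinates    txt[6][34]  -> opening hours
#   txt[6][72]  -> header image   txt[6][126] -> google_url (with ludocid)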
def time_parsing_js(time_json, output):
    weekday_text = []
    periods = []
    if time_json is None or time_json[1] is None:
        output['open_now'] = 'False'
        output['periods'] = ''
        output['weekday_text'] = ''
        output['time_status'] = ''
        return output
    for time_ in time_json[1]:
        week = time_[0]
        weekday_text += ['{}: {}'.format(week, ', '.join(time_[1]))]
        for t in time_[1]:
            if t == '24 小時營業':  # "open 24 hours"
                periods += [{
                    "open": {"day": week_list[week], "time": '0000'},
                    "close": {"day": week_list[week], "time": ''}
                }]
            elif t == '休息':  # "closed"
                periods += [{
                    "open": {"day": week_list[week], "time": ''},
                    "close": {"day": week_list[week], "time": ''}
                }]
            else:
                start, end = t.split('–')
                end_hour, end_min = end.split(':')
                start_hour, start_min = start.split(':')
                # Compare hours numerically; string comparison breaks on e.g. '9' vs '10'.
                if int(end_hour) < int(start_hour):
                    end_day = week_list[week] + 1  # closes after midnight
                else:
                    end_day = week_list[week]
                periods += [{
                    "open": {"day": week_list[week], "time": start.replace(':', '')},
                    "close": {"day": end_day, "time": end.replace(':', '')}
                }]
    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    output['time_status'] = blank_check(time_json[4][4].split('⋅')[0])
    # '永久停業' = permanently closed, '暫時關閉' / '暫停營業' = temporarily closed.
    if output['time_status'].find('永久停業') != -1 or\
       output['time_status'].find('暫時關閉') != -1 or\
       output['time_status'].find('暫停營業') != -1:
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'
    return output
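
# Example: a Monday entry '09:00–18:00' becomes
#   {"open":  {"day": week_list['星期一'], "time": "0900"},
#    "close": {"day": week_list['星期一'], "time": "1800"}}
# (week_list, imported from utility.parseutils, is assumed to map localized
# weekday names to integer day numbers.)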
def save_js_to_db(jsobj, fid):
    global shop_table
    global iddict
    # Only insert stores we have not seen before.
    if iddict.get(fid) is None:
        try:
            shop_table.insert(jsobj)
        except:
            traceback.print_exc()
def process_web_request_start(driver):
    time.sleep(3)
    print("start&**********************")
    for request in driver.requests:
        if request.response:
            if 'place?' in request.url:
                print(request.url)
                resp = request.response.body
                # Content-Encoding may be absent; default to '' to avoid a TypeError.
                encoding = request.response.headers.get('Content-Encoding', '')
                if 'gzip' in encoding:
                    resp = gzip.decompress(request.response.body)
                if 'br' in encoding:
                    resp = brotli.decompress(request.response.body)
                jstext = resp.decode('utf-8')
                output = parsing_js(jstext)
                time.sleep(1)
                del driver.requests
                return output
    del driver.requests
    return 0
def reviews_parsing_js(resp):
    columns_name = ['id', 'author_page', 'author_name', 'profile_photo_url', 'author_review_count',
                    'created_at', 'text', 'photos', 'rating', 'store_review_time', 'store_review']
    jsobj = json.loads(resp[5:])
    result = []
    for i in range(len(jsobj[2])):
        tmp = []
        tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
        tmp += [jsobj[2][i][1], jsobj[2][i][3]]
        # image
        image = []
        if jsobj[2][i][14]:
            for j in range(len(jsobj[2][i][14])):
                image += [jsobj[2][i][14][j][6][0]]
        tmp += [image]
        # rating
        tmp += [jsobj[2][i][4]]
        # store reply
        if jsobj[2][i][9]:
            tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
        else:
            tmp += ['', '']
        result.append(list(map(lambda x, y: {x: y}, columns_name, tmp)))
    return result
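
# Each review comes back as a list of single-key dicts in columns_name order, e.g.
#   [{'id': ...}, {'author_page': ...}, ..., {'store_review': ...}],
# and the whole list is later stringified into output['reviews'].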
def process_web_request_reviews(driver, output):
    time.sleep(3)
    print("reviews&**********************")
    for request in driver.requests:
        if request.response:
            if 'listentitiesreviews?' in request.url:
                print(request.url)
                resp = request.response.body
                encoding = request.response.headers.get('Content-Encoding', '')
                if 'gzip' in encoding:
                    resp = gzip.decompress(request.response.body)
                if 'br' in encoding:
                    resp = brotli.decompress(request.response.body)
                jstext = resp.decode('utf-8')
                result = reviews_parsing_js(jstext)
                output['reviews'] = str(result)
                time.sleep(1)
                del driver.requests
                return output
    del driver.requests
    return 0
def photos_parsing_js(resp):
    def image_url_change_size(url):
        # Street View tiles keep their original URL; others are resized to s600.
        if url.find('streetviewpixels') != -1:
            return url
        url_split = url.split('=')
        return url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2:])
    jsobj = json.loads(resp[5:])
    # write_to_file(jsobj,'tmp/debug_{}.pickle'.format(c))
    menu = []
    all_photos = []  # renamed from `all` to avoid shadowing the builtin
    photo_category_map = {}
    for row in jsobj[12][0]:
        photo_category_map[row[0]] = row[2]
    if photo_category_map[jsobj[13][0]] == '全部':  # "all"
        for img in jsobj[0]:
            all_photos += [image_url_change_size(img[6][0])]
    elif photo_category_map[jsobj[13][0]] == '菜單':  # "menu"
        for img in jsobj[0]:
            menu += [image_url_change_size(img[6][0])]
    return list(set(menu)), list(set(all_photos))
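
# Illustrative example of image_url_change_size (hypothetical URL): an input of
#   https://lh5.googleusercontent.com/p/XXXX=w408-h306-k-no
# keeps the last two size tokens and becomes
#   https://lh5.googleusercontent.com/p/XXXX=s600-k-no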
def process_web_request_photo(driver, output, fid):
    # Find the photo-category tabs ('菜單' = menu, '全部' = all).
    try:
        driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
        photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
        tab_dict = {}
        for tab_index in [0, 1, 2]:
            selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
            if len(selector) != 0:
                photo_name = selector[0].text
                if photo_name == '菜單':
                    tab_dict[photo_name] = tab_index
                elif photo_name == '全部':
                    tab_dict[photo_name] = tab_index
    except:
        tab_dict = {}
    print(tab_dict)
    # Accumulate across tabs; each tab click triggers new photo? requests.
    menu_list = []
    all_list = []
    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        time.sleep(1)
        print("photo&**********************")
        for request in driver.requests:
            if request.response:
                if 'photo?' in request.url:
                    # Match requests for this place by the first half of its fid.
                    front, _ = fid.split(':')
                    if request.url.find(front) != -1:
                        print(request.url)
                        resp = request.response.body
                        encoding = request.response.headers.get('Content-Encoding', '')
                        if 'gzip' in encoding:
                            resp = gzip.decompress(request.response.body)
                        if 'br' in encoding:
                            resp = brotli.decompress(request.response.body)
                        jstext = resp.decode('utf-8')
                        menu, all_photos = photos_parsing_js(jstext)
                        menu_list += menu
                        all_list += all_photos
        # Clear captured requests between tabs.
        del driver.requests
    output['shop_photo'] = str(all_list[:5])
    output['menu_photo'] = str(menu_list[:5])
    del driver.requests
    return output
def main():
    global chrome_window
    global store_list_table
    global shop_table
    global proxyport
    global iddict
    localip = socket.gethostbyname(socket.gethostname())
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    # store_list_table = db['swire_store_list']
    shop_table = db['shop_list4']
    progress_table = db['progress_list']
    iddict = build_cache(db)
    # print("iddict...{}".format(datetime.now()))
    port = 4444
    if len(sys.argv) == 3:
        port = int(sys.argv[1])
        proxyport = int(sys.argv[2])
    if not chrome_window:
        print('restart docker pw{}'.format(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(5)
    print('driver start...')
    driver = brower_start(port)
    job = get_next_job(db)
    for row, group in job.iterrows():
        try:
            print(row)
            keyword = group['分店'] + group['地址']  # branch name + address
            item_url = 'https://www.google.com/maps/place/?q={}'.format(keyword)
            print(item_url)
            # shop info
            print('parsing shop info....')
            output = 0
            for i in range(5):
                print('shop info try...{}'.format(i))
                print("shop info try...{}".format(datetime.now()))
                driver.get(item_url)
                time.sleep(3)
                element = driver.find_elements(By.CSS_SELECTOR, 'div[role="article"]')
                if len(element) != 0:
                    item_url = element[0].find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
                    print(item_url)
                    driver.get(item_url)
                    time.sleep(3)
                wait = WebDriverWait(driver, 10)
                wait.until(
                    EC.element_to_be_clickable((By.ID, 'sb_cb50'))
                )
                element = driver.find_element(By.ID, 'sb_cb50')
                driver.implicitly_wait(9)
                ActionChains(driver).move_to_element(element).click(element).perform()
                time.sleep(1)
                driver.back()
                if driver.current_url == item_url: continue
                print(driver.current_url)
                output = process_web_request_start(driver)
                if output != 0: break
            if output == 0:
                continue  # all five attempts failed; skip this store
            print(output)
            # reviews
            print('parsing reviews....')
            print("parsing reviews.....{}".format(datetime.now()))
            if not output['user_ratings_total']:
                output['reviews'] = ''
            else:
                for i in range(3):
                    print('reviews try...{}'.format(i))
                    print("reviews try.....{}".format(datetime.now()))
                    wait = WebDriverWait(driver, 30)
                    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
                    wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
                    )
                    element = driver.find_element(By.CSS_SELECTOR, more_reviews_css)
                    driver.implicitly_wait(10)
                    ActionChains(driver).move_to_element(element).click(element).perform()
                    time.sleep(0.5)
                    output_ = process_web_request_reviews(driver, output)
                    if output_ != 0:
                        output = output_
                        break
                    else:
                        driver.get(item_url)
                        time.sleep(0.5)
            # photo
            print('parsing photo....')
            if output['header_image'] != '':
                for i in range(3):
                    print('photo try...{}'.format(i))
                    print("photo try......{}".format(datetime.now()))
                    driver.get(item_url)
                    time.sleep(0.5)
                    print(driver.current_url)
                    try:
                        wait = WebDriverWait(driver, 30)
                        # "div[aria-label='{}的相片']" = "photos of {name}"
                        wait.until(
                            EC.element_to_be_clickable((By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name'])))
                        )
                        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
                        ActionChains(driver).move_to_element(element).click(element).perform()
                        output = process_web_request_photo(driver, output, output['fid'])
                        break
                    except:
                        pass
            else:
                output['shop_photo'] = '[]'
                output['menu_photo'] = '[]'
            output['item_url'] = item_url
            output['keyword'] = keyword
            if output['google_url'] == '':
                query_name = output['adress_name'].replace('(', '').replace(')', '').replace(' ', '')
                output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
            output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
            print(output)
            save_js_to_db(output, output['fid'])
            print("save_js_to_db......{}".format(datetime.now()))
            progress_table.insert({'id_': group['分店編號'],
                                   'name': output['name'],
                                   'fid': output['fid'],
                                   'check_': 1})
        except TimeoutException:
            traceback.print_exc()
            continue
        except:
            traceback.print_exc()
            progress_table.insert({'id_': group['分店編號'],
                                   'name': output['name'],
                                   'fid': output['fid'],
                                   'check_': 0})
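
# Usage (defaults from the argv handling above):
#   python run5.py <selenium_port> <proxy_port>   e.g. python run5.py 4444 8787
# With no arguments, port stays 4444 and proxyport stays 8787.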
if __name__ == '__main__':
    main()