run4.py

# -*- coding: utf-8 -*-
#from selenium import webdriver
#from tkinter.tix import TEXT
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
import selenium
import traceback
from bs4 import BeautifulSoup
import gzip
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
from requests import session
import pandas as pd
import dataset
import time
import json
import re
import sys, os
import socket
import brotli
import pickle
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import urllib.parse

chrome_window = False
globalkw = None
proxyport = 8787

def write_to_file(jsobj, fname):
    with open(fname, 'wb') as handle:
        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)


def build_cache(db):
    global reviews_table
    id_dict = {}
    cursor = db.query('SELECT fid FROM google_poi.shop_list3;')
    for c in cursor:
        key = '{}'.format(c['fid'])
        id_dict[key] = 1
    return id_dict
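
# build_cache() pairs with save_js_to_db() further down: the returned dict acts
# as a seen-set keyed by fid, so a shop already present in shop_list3 is never
# inserted twice. A plain set would serve the same purpose.
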
#def brower_start(port):
#    global proxyport
#    global chrome_window
#    print(proxyport)
#    options = webdriver.ChromeOptions()
#    if chrome_window:
#        browser = webdriver.Chrome(
#            desired_capabilities=options.to_capabilities()
#        )
#    else:
#        chrome_options = webdriver.ChromeOptions()
#        chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport))  # Specify your Kubernetes service-name here
#        chrome_options.add_argument('--ignore-certificate-errors')
#        chrome_options.add_argument("--no-sandbox")
#        chrome_options.add_argument("--disable-dev-shm-usage")
#        browser = webdriver.Remote(
#            command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
#            desired_capabilities=chrome_options.to_capabilities(),
#            seleniumwire_options={'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False}
#        )
#    browser.set_window_size(1400, 1000)
#    return browser

def brower_start(port):
    # Launch a local headless Chrome through selenium-wire so network traffic can be inspected.
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument("--no-sandbox")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    browser = webdriver.Chrome(options=options)
    browser.set_window_size(1400, 1000)
    # browser = webdriver.Remote(
    #     command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
    #     # command_executor='http://192.53.174.202:'+str(port)+'/wd/hub',
    #     desired_capabilities=options.to_capabilities()
    # )
    return browser

def get_next_job(db):
    # Pick 30 random stores that have no entry in error_list2 yet (anti-join),
    # then drop any URLs already recorded as failures in error_list3.
    result = {}
    # result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 100')
    result = db.query('SELECT * FROM swire_store_list a WHERE not exists (select 1 from error_list2 tei where tei.fid = a.fid limit 1 ) ORDER BY RAND() limit 30')
    url_pd = pd.DataFrame([dict(i) for i in result])
    url_pd['item_url'] = url_pd['place_id'].apply(lambda x: 'https://www.google.com/maps/place/?q=place_id:{}'.format(x))
    # url_pd['fid'] = url_pd['item_url'].apply(lambda x: x.split('data=')[1].split('!')[3])
    # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
    remove = db.query('select item_url from error_list3')
    remove = pd.DataFrame([dict(i) for i in remove])
    if len(remove) != 0:
        remove_fid_list = remove['item_url'].to_list()
        url_pd = url_pd[~url_pd['item_url'].isin(remove_fid_list)]
    return url_pd
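
# Shape note (inferred from how main() consumes the result): the returned
# DataFrame is expected to carry at least the swire_store_list columns
# place_id, name, num, keyword and fid, plus the derived item_url column.
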
def parsing_js(resp):
    # Responses carry a 5-byte anti-XSSI prefix (")]}'" plus a newline); strip it before decoding.
    txt = json.loads(resp[5::])
    output = {}
    output['name'] = txt[6][11]
    output['adress_name'] = txt[6][18]  # key spelling kept as-is to match the existing table schema

    if txt[6][4]:
        if txt[6][4][7]:
            output['rating'] = str(txt[6][4][7])
        else:
            output['rating'] = None

        if txt[6][4][8]:
            output['user_ratings_total'] = str(txt[6][4][8])
        else:
            output['user_ratings_total'] = None

        if txt[6][4][2]:
            output['price_level'] = str(len(['$' for i in txt[6][4][2] if i == '$']))
        else:
            output['price_level'] = None
    else:
        output['rating'] = None
        output['user_ratings_total'] = None
        output['price_level'] = None

    if txt[6][37][0]:
        output['lon'] = txt[6][37][0][0][8][0][1]
        output['lat'] = txt[6][37][0][0][8][0][2]
    else:
        output['lon'] = None
        output['lat'] = None

    if txt[6][178]:
        output['tel'] = txt[6][178][0][0]
    else:
        output['tel'] = ''

    if txt[6][13]:
        output['category'] = txt[6][13][0]
    else:
        output['category'] = ''

    try:
        location = txt[6][183][2][2][0]
        if location:
            location_s = location.split(' ')
            output['city'], output['area'] = location_s[-1], location_s[-2]
        else:
            output['city'], output['area'] = '', ''
    except:
        output['city'], output['area'] = '', ''

    if txt[6][100]:
        for item in txt[6][100][1]:
            name = item[1]
            if name not in intro_list.keys(): continue
            name_map = intro_list[name]
            c = 0
            detail = []
            for t in item[2]:
                value = t[1]
                if t[3] == 1:
                    # t[3] == 1 flags an attribute the place does not offer; '不提供' means "not provided"
                    detail += [{'id': c, name_map[1]: '不提供' + str(value)}]
                else:
                    detail += [{'id': c, name_map[1]: value}]
                c += 1
            output[name_map[0]] = str(detail)

    # make sure every intro_list field exists, even when the payload had no data for it
    for key in intro_list:
        if intro_list[key][0] not in output.keys():
            output[intro_list[key][0]] = '[]'

    if txt[6][34]:
        output = time_parsing_js(txt[6][34], output)
    else:
        output['open_now'] = 'False'
        output['periods'] = ''
        output['weekday_text'] = ''
        output['time_status'] = ''

    if txt[6][72]:
        output['header_image'] = txt[6][72][0][0][6][0]
    else:
        output['header_image'] = ''

    output['ludocid'] = ''  # default so later lookups cannot raise KeyError
    if txt[6][126]:
        output['google_url'] = txt[6][126][4]
        ludocid_str = [i for i in txt[6][126][4].split('&') if i.find('ludocid') != -1]
        if len(ludocid_str) != 0:
            ludocid = ludocid_str[0].split('=')[-1]
            output['ludocid'] = ludocid
    else:
        output['google_url'] = ''
    # write_to_file(orig,'debug.pickle')
    return output
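
# Sketch of parsing_js() output (keys only; values here are illustrative):
#   {'name': ..., 'adress_name': ..., 'rating': '4.2', 'user_ratings_total': '513',
#    'price_level': '2', 'lon': ..., 'lat': ..., 'tel': ..., 'category': ...,
#    'city': ..., 'area': ..., 'periods': '[...]', 'weekday_text': '[...]',
#    'time_status': ..., 'open_now': 'True', 'header_image': ...,
#    'google_url': ..., 'ludocid': ...}
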
def time_parsing_js(time_json, output):
    weekday_text = []
    periods = []
    for time_ in time_json[1]:
        week = time_[0]
        weekday_text += ['{}: {}'.format(week, ', '.join(time_[1]))]

        for t in time_[1]:
            if t == '24 小時營業':  # "open 24 hours"
                periods += [{
                    "open": {"day": week_list[week], "time": '0000'},
                    "close": {"day": week_list[week], "time": ''}
                }]
            elif t == '休息':  # "closed"
                periods += [{
                    "open": {"day": week_list[week], "time": ''},
                    "close": {"day": week_list[week], "time": ''}
                }]
            else:
                start, end = t.split('–')
                end_hour, end_min = end.split(':')
                start_hour, start_min = start.split(':')

                # compare numerically (the parts are strings); a smaller closing hour
                # means the place closes past midnight, i.e. on the next day
                if int(end_hour) < int(start_hour):
                    end_day = week_list[week] + 1
                else:
                    end_day = week_list[week]

                periods += [{
                    "open": {"day": week_list[week], "time": start.replace(':', '')},
                    "close": {"day": end_day, "time": end.replace(':', '')}
                }]

    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    output['time_status'] = blank_check(time_json[4][4].split('⋅')[0])

    # '永久停業' = permanently closed; '暫時關閉' / '暫停營業' = temporarily closed
    if output['time_status'].find('永久停業') != -1 or \
       output['time_status'].find('暫時關閉') != -1 or \
       output['time_status'].find('暫停營業') != -1:
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'
    return output
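
# Example entry appended to `periods` for an hours string such as '09:00–17:30'
# on a weekday that week_list maps to 1 (mirrors the Places API periods layout):
#   {"open":  {"day": 1, "time": "0900"},
#    "close": {"day": 1, "time": "1730"}}
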
def save_js_to_db(jsobj, fid):
    global shop_table
    global iddict
    jsobj['fid'] = fid
    if iddict.get(fid) is None:
        try:
            shop_table.insert(jsobj)
        except:
            traceback.print_exc()

def process_web_request_start(driver, fid):
    time.sleep(3)
    print("start&**********************")
    for request in driver.requests:
        if request.response:
            # print(request.url)
            if 'place?' in request.url:
                # print('parsing js:')
                front, _ = fid.split(':')
                if request.url.find(front) != -1:
                    print(request.url)
                    resp = request.response.body
                    # the header may be absent, so fall back to '' before the substring test
                    content_encoding = request.response.headers.get('Content-Encoding') or ''
                    if 'gzip' in content_encoding:
                        resp = gzip.decompress(request.response.body)
                    if 'br' in content_encoding:
                        resp = brotli.decompress(request.response.body)
                    jstext = resp.decode('utf-8')
                    output = parsing_js(jstext)
                    time.sleep(1)
                    return output
    return 0
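
# The loop above relies on selenium-wire's capture: driver.requests lists every
# request the browser issued, and each request.response exposes the raw .body
# bytes plus .headers, which is why gzip/brotli decompression is done by hand.
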
def reviews_parsing_js(resp):
    columns_name = ['id', 'author_page', 'author_name', 'profile_photo_url', 'author_review_count',
                    'created_at', 'text', 'photos', 'rating', 'store_review_time', 'store_review']
    jsobj = json.loads(resp[5::])
    result = []
    for i in range(len(jsobj[2])):
        tmp = []
        tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
        tmp += [jsobj[2][i][1], jsobj[2][i][3]]

        # image
        image = []
        if jsobj[2][i][14]:
            for j in range(len(jsobj[2][i][14])):
                image += [jsobj[2][i][14][j][6][0]]
        tmp += [image]

        # rating
        tmp += [jsobj[2][i][4]]

        # store reply
        if jsobj[2][i][9]:
            tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
        else:
            tmp += ['', '']

        result.append(list(map(lambda x, y: {x: y}, columns_name, tmp)))
    return result
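
# Each parsed review comes back as a list of single-key dicts pairing a column
# name with its value, e.g. [{'id': ...}, {'author_page': ...}, ...,
# {'store_review': ...}] (eleven entries, one per name in columns_name).
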
def process_web_request_reviews(driver, output, ludocid):
    time.sleep(3)
    print("reviews&**********************")
    for request in driver.requests:
        if request.response:
            # print(request.url)
            if 'listentitiesreviews?' in request.url:
                if request.url.find(ludocid) != -1:
                    print(request.url)
                    resp = request.response.body
                    content_encoding = request.response.headers.get('Content-Encoding') or ''
                    if 'gzip' in content_encoding:
                        resp = gzip.decompress(request.response.body)
                    if 'br' in content_encoding:
                        resp = brotli.decompress(request.response.body)
                    jstext = resp.decode('utf-8')
                    # this endpoint returns review data, so it must be parsed with
                    # reviews_parsing_js rather than the place parser parsing_js
                    result = reviews_parsing_js(jstext)
                    output['reviews'] = str(result)
                    time.sleep(1)
                    return output
    return 0
def photos_parsing_js(resp):
    def image_url_change_size(url):
        # street-view tiles carry no size directive to rewrite, so pass them through
        if url.find('streetviewpixels') != -1:
            return url
        else:
            url_split = url.split('=')
            new_url = url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2::])
            return new_url

    jsobj = json.loads(resp[5::])
    # write_to_file(jsobj,'tmp/debug_{}.pickle'.format(c))
    menu = []
    all_photos = []  # named all_photos to avoid shadowing the builtin all()
    photo_category_map = {}
    for row in jsobj[12][0]:
        photo_category_map[row[0]] = row[2]

    # '全部' = the "All" tab, '菜單' = the "Menu" tab
    if photo_category_map[jsobj[13][0]] == '全部':
        for img in jsobj[0][:5]:
            all_photos += [image_url_change_size(img[6][0])]
    elif photo_category_map[jsobj[13][0]] == '菜單':
        for img in jsobj[0][:5]:
            menu += [image_url_change_size(img[6][0])]
    return menu, all_photos
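
# image_url_change_size() rewrites the size directive of a photo URL to request
# a 600px variant. With a hypothetical URL
#   https://lh5.googleusercontent.com/p/AF1Qxx=w408-h306-k-no
# the split/join yields
#   https://lh5.googleusercontent.com/p/AF1Qxx=s600-k-no
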
def process_web_request_photo(driver, output, fid):
    try:
        driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
        photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
        tab_dict = {}
        for tab_index in [0, 1, 2]:
            selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
            if len(selector) != 0:
                photo_name = selector[0].text
                if photo_name == '菜單':  # "Menu"
                    tab_dict[photo_name] = tab_index
                elif photo_name == '全部':  # "All"
                    tab_dict[photo_name] = tab_index
    except:
        tab_dict = {}

    print(tab_dict)
    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        time.sleep(1)

    print("photo&**********************")
    menu_list = []
    all_list = []
    for request in driver.requests:
        if request.response:
            # print(request.url)
            if 'photo?' in request.url:
                # print('parsing js:')
                front, _ = fid.split(':')
                if request.url.find(front) != -1:
                    print(request.url)
                    resp = request.response.body
                    # decompress according to Content-Encoding, matching the other
                    # handlers (this response is not guaranteed to be brotli)
                    content_encoding = request.response.headers.get('Content-Encoding') or ''
                    if 'gzip' in content_encoding:
                        resp = gzip.decompress(request.response.body)
                    if 'br' in content_encoding:
                        resp = brotli.decompress(request.response.body)
                    jstext = resp.decode('utf-8')
                    menu, all_photos = photos_parsing_js(jstext)
                    menu_list += menu
                    all_list += all_photos

    output['shop_photo'] = str(all_list[:5])
    output['menu_photo'] = str(menu_list[:5])
    return output
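
# process_web_request_photo() works in two phases: clicking the '全部' ("All") and
# '菜單' ("Menu") tabs makes the browser fire the corresponding 'photo?' XHRs, and
# the replay of driver.requests then parses whichever captured responses match the fid.
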
def main():
    global chrome_window
    global store_list_table
    global shop_table
    global proxyport
    global iddict

    localip = socket.gethostbyname(socket.gethostname())
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    shop_table = db['shop_list3']
    error_table = db['error_list2']

    iddict = build_cache(db)
    print("iddict...{}".format(datetime.now()))

    port = 4444
    if len(sys.argv) == 3:
        port = int(sys.argv[1])
        proxyport = int(sys.argv[2])

    if not chrome_window:
        print('restart docker pw{}'.format(port))
        # os.system('sudo docker container restart p'+str(port))
        # os.system('sudo docker container restart pw'+str(port))
        # time.sleep(10)

    print('driver start...')
    driver = brower_start(port)

    job = get_next_job(db)
    c = 0
    for row, group in job.iterrows():
        try:
            item_url = group['item_url']
            name = group['name']
            num = group['num']
            keyword = group['keyword']
            fid = group['fid']

            if name:
                db_name = name
            else:
                db_name = num

            print(fid, keyword, db_name)
            print(item_url)

            # shop info
            print('parsing shop info....')
            for i in range(5):
                print('shop info try...{}'.format(i))
                print("shop info try...{}".format(datetime.now()))
                driver.get(item_url)
                time.sleep(3)

                wait = WebDriverWait(driver, 10)
                wait.until(
                    EC.element_to_be_clickable((By.ID, 'sb_cb50'))
                )
                element = driver.find_element(By.ID, 'sb_cb50')
                driver.implicitly_wait(9)
                ActionChains(driver).move_to_element(element).click(element).perform()
                time.sleep(1)
                driver.back()

                if driver.current_url == item_url: continue
                print(driver.current_url)
                output = process_web_request_start(driver, fid)
                if output != 0: break

            # reviews
            print('parsing reviews....')
            print("parsing reviews.....{}".format(datetime.now()))
            if not output['user_ratings_total']:
                output['reviews'] = ''
            else:
                for i in range(3):
                    print('reviews try...{}'.format(i))
                    print("reviews try.....{}".format(datetime.now()))
                    try:
                        wait = WebDriverWait(driver, 30)
                        more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
                        wait.until(
                            EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
                        )
                        element = driver.find_element(By.CSS_SELECTOR, more_reviews_css)
                        driver.implicitly_wait(10)
                        ActionChains(driver).move_to_element(element).click(element).perform()
                        time.sleep(0.5)

                        output_ = process_web_request_reviews(driver, output, output['ludocid'])
                        if output_ != 0:
                            output = output_
                            break
                    except:
                        driver.get(item_url)
                        time.sleep(0.5)

                if 'reviews' not in output.keys():
                    continue

            # photo
            print('parsing photo....')
            if output['header_image'] != '':
                for i in range(3):
                    print('photo try...{}'.format(i))
                    print("photo try......{}".format(datetime.now()))
                    driver.get(item_url)
                    time.sleep(0.5)
                    print(driver.current_url)
                    try:
                        wait = WebDriverWait(driver, 30)
                        # '{}的相片' = "photos of {}"
                        wait.until(
                            EC.element_to_be_clickable((By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name'])))
                        )
                        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
                        ActionChains(driver).move_to_element(element).click(element).perform()
                        output = process_web_request_photo(driver, output, fid)
                        break
                    except:
                        pass
            else:
                output['shop_photo'] = '[]'
                output['menu_photo'] = '[]'

            output['item_url'] = item_url
            output['keyword'] = keyword
            if output['google_url'] == '':
                query_name = output['adress_name'].replace('(', '').replace(')', '').replace(' ', '')
                output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
            output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")

            print(output)
            save_js_to_db(output, fid)
            print("save_js_to_db......{}".format(datetime.now()))
            error_table.upsert({'item_url': item_url, 'check_': 1}, ['item_url'])
            print('*' * 10)
        except TimeoutException:
            traceback.print_exc()
            break
        except:
            error_table3 = db['error_list3']
            error_table3.insert({'name': name, 'keyword': keyword, 'item_url': item_url,
                                 'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
            traceback.print_exc()
            # sys.exit()


if __name__ == '__main__':
    main()
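
# Usage: python run4.py <selenium_port> <proxy_port>
# Both arguments must be given together; otherwise port=4444 and proxyport=8787 are used.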