# run4.py
# -*- coding: utf-8 -*-
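# This script crawls Google Maps place pages for the stores queued in the
# swire_store_list table: it opens each place URL in headless Chrome via
# selenium-wire, decodes the brotli-compressed XHR responses for shop info,
# reviews and photos, and writes the parsed records to shop_list3, tracking
# progress in error_list2 and failures in error_list3.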
#from selenium import webdriver
#from tkinter.tix import TEXT
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
import selenium
import traceback
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
from requests import session
import pandas as pd
import dataset
import time
import json
import re
import sys, os
import socket
import brotli
import pickle
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import urllib.parse

chrome_window = False
globalkw = None
proxyport = 8787


def write_to_file(jsobj, fname):
    with open(fname, 'wb') as handle:
        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)


def build_cache(db):
    # Cache the fids already present in shop_list3 so previously crawled
    # stores are not inserted twice (see save_js_to_db below).
    global reviews_table
    id_dict = {}
    cursor = db.query('SELECT fid FROM google_poi.shop_list3;')
    for c in cursor:
        key = '{}'.format(c['fid'])
        id_dict[key] = 1
    return id_dict


#def brower_start(port):
#    global proxyport
#    global chrome_window
#    print(proxyport)
#    options = webdriver.ChromeOptions()
#    if chrome_window:
#        browser = webdriver.Chrome(
#            desired_capabilities=options.to_capabilities()
#        )
#    else:
#        chrome_options = webdriver.ChromeOptions()
#        chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport))  # Specify your Kubernetes service-name here
#        chrome_options.add_argument('--ignore-certificate-errors')
#        chrome_options.add_argument("--no-sandbox")
#        chrome_options.add_argument("--disable-dev-shm-usage")
#        browser = webdriver.Remote(
#            command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
#            desired_capabilities=chrome_options.to_capabilities(),
#            seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
#        )
#    browser.set_window_size(1400,1000)
#    return browser


def brower_start(port):
    # Launch headless Chrome through selenium-wire; the `port` argument is
    # only used by the commented-out Remote/Docker variant below.
    options = webdriver.ChromeOptions()
    # browser = webdriver.Chrome(options=options)
    options.add_argument('--ignore-certificate-errors')
    options.add_argument("--no-sandbox")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    browser = webdriver.Chrome(options=options)
    browser.set_window_size(1400, 1000)
    # browser = webdriver.Remote(
    #     command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
    #     # command_executor='http://192.53.174.202:'+str(port)+'/wd/hub',
    #     desired_capabilities=options.to_capabilities()
    # )
    return browser
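
# selenium-wire (imported above as `from seleniumwire import webdriver`)
# records every network request the page issues on `driver.requests`; the
# process_web_request_* helpers below scan that list for the Maps XHR
# endpoints ('place?', 'listentitiesreviews?', 'photo?') and
# brotli-decompress the matching response bodies before parsing.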


def get_next_job(db):
    # Sample 100 stores that are not yet marked done in error_list2,
    # build their place URLs, and drop any URL already logged in error_list3.
    result = {}
    # result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 100')
    result = db.query('SELECT * FROM swire_store_list a WHERE not exists (select 1 from error_list2 tei where tei.fid = a.fid limit 1 ) ORDER BY RAND() limit 100')
    url_pd = pd.DataFrame([dict(i) for i in result])
    url_pd['item_url'] = url_pd['place_id'].apply(lambda x: 'https://www.google.com/maps/place/?q=place_id:{}'.format(x))
    # url_pd['fid'] = url_pd['item_url'].apply(lambda x: x.split('data=')[1].split('!')[3])
    # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))

    remove = db.query('select item_url from error_list3')
    remove = pd.DataFrame([dict(i) for i in remove])
    if len(remove) != 0:
        remove_fid_list = remove['item_url'].to_list()
        url_pd = url_pd[~url_pd['item_url'].isin(remove_fid_list)]
    return url_pd


def parsing_js(resp):
    # The endpoint prefixes its JSON payload with an anti-hijacking guard,
    # so the first five characters are dropped before decoding.
    txt = json.loads(resp[5::])
    output = {}
    output['name'] = txt[6][11]
    output['adress_name'] = txt[6][18]

    if txt[6][4]:
        if txt[6][4][7]:
            output['rating'] = str(txt[6][4][7])
        else:
            output['rating'] = None

        if txt[6][4][8]:
            output['user_ratings_total'] = str(txt[6][4][8])
        else:
            output['user_ratings_total'] = None

        if txt[6][4][2]:
            output['price_level'] = str(len(['$' for i in txt[6][4][2] if i == '$']))
        else:
            output['price_level'] = None
    else:
        output['rating'] = None
        output['user_ratings_total'] = None
        output['price_level'] = None

    if txt[6][37][0]:
        output['lon'] = txt[6][37][0][0][8][0][1]
        output['lat'] = txt[6][37][0][0][8][0][2]
    else:
        output['lon'] = None
        output['lat'] = None

    if txt[6][178]:
        output['tel'] = txt[6][178][0][0]
    else:
        output['tel'] = ''

    if txt[6][13]:
        output['category'] = txt[6][13][0]
    else:
        output['category'] = ''

    try:
        location = txt[6][183][2][2][0]
        if location:
            location_s = location.split(' ')
            output['city'], output['area'] = location_s[-1], location_s[-2]
        else:
            output['city'], output['area'] = '', ''
    except:
        output['city'], output['area'] = '', ''

    if txt[6][100]:
        for item in txt[6][100][1]:
            name = item[1]
            if name not in intro_list.keys():
                continue
            name_map = intro_list[name]
            c = 0
            detail = []
            for t in item[2]:
                value = t[1]
                if t[3] == 1:
                    # '不提供' means the amenity is "not offered"
                    detail += [{'id': c, name_map[1]: '不提供' + str(value)}]
                else:
                    detail += [{'id': c, name_map[1]: value}]
                c += 1
            output[name_map[0]] = str(detail)

    for key in intro_list:
        if intro_list[key][0] not in output.keys():
            output[intro_list[key][0]] = '[]'

    if txt[6][34]:
        output = time_parsing_js(txt[6][34], output)
    else:
        output['open_now'] = 'False'
        output['periods'] = ''
        output['weekday_text'] = ''
        output['time_status'] = ''

    if txt[6][72]:
        output['header_image'] = txt[6][72][0][0][6][0]
    else:
        output['header_image'] = ''

    if txt[6][126]:
        output['google_url'] = txt[6][126][4]
        ludocid_str = [i for i in txt[6][126][4].split('&') if i.find('ludocid') != -1]
        if len(ludocid_str) != 0:
            ludocid = ludocid_str[0].split('=')[-1]
            output['ludocid'] = ludocid
    else:
        output['google_url'] = ''
    # write_to_file(orig,'debug.pickle')
    return output
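
# A minimal sketch of the response framing that parsing_js() (and the other
# *_parsing_js helpers) strip off; the payload shown is illustrative, not a
# real response:
#
#   raw = ")]}'\n[...]"          # 5-character anti-JSON-hijacking guard
#   data = json.loads(raw[5:])   # equivalent to json.loads(resp[5::]) above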


def time_parsing_js(time_json, output):
    weekday_text = []
    periods = []
    for time_ in time_json[1]:
        week = time_[0]
        weekday_text += ['{}: {}'.format(week, ', '.join(time_[1]))]
        for t in time_[1]:
            if t == '24 小時營業':  # "open 24 hours"
                periods += [{
                    "open": {
                        "day": week_list[week],
                        "time": '0000'
                    },
                    "close": {
                        "day": week_list[week],
                        "time": ''
                    }
                }]
            elif t == '休息':  # "closed"
                periods += [{
                    "open": {
                        "day": week_list[week],
                        "time": ''
                    },
                    "close": {
                        "day": week_list[week],
                        "time": ''
                    }
                }]
            else:
                start, end = t.split('–')
                end_hour, end_min = end.split(':')
                start_hour, start_min = start.split(':')
                # Compare hours numerically (string comparison misorders
                # e.g. '9' vs '17'); an end hour before the start hour
                # means the range crosses midnight into the next day.
                if int(end_hour) < int(start_hour):
                    end_day = week_list[week] + 1
                else:
                    end_day = week_list[week]
                periods += [{
                    "open": {
                        "day": week_list[week],
                        "time": start.replace(':', '')
                    },
                    "close": {
                        "day": end_day,
                        "time": end.replace(':', '')
                    }
                }]
    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    output['time_status'] = blank_check(time_json[4][4].split('⋅')[0])
    # '永久停業' = permanently closed; '暫時關閉' / '暫停營業' = temporarily closed
    if output['time_status'].find('永久停業') != -1 or \
       output['time_status'].find('暫時關閉') != -1 or \
       output['time_status'].find('暫停營業') != -1:
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'
    return output
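
# Shape of the opening-hours fields produced above (values illustrative):
#   output['weekday_text'] -> "['星期一: 11:00–21:00', ...]"
#   output['periods']      -> "[{'open': {'day': 1, 'time': '1100'},
#                                'close': {'day': 1, 'time': '2100'}}, ...]"
# Both are serialized with str(), like the other list-valued fields.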


def save_js_to_db(jsobj, fid):
    # Insert only fids that were not already cached by build_cache().
    global shop_table
    global iddict
    jsobj['fid'] = fid
    if iddict.get(fid) is None:
        try:
            shop_table.insert(jsobj)
        except:
            traceback.print_exc()


def process_web_request_start(driver, fid):
    time.sleep(3)
    print("start&**********************")
    for request in driver.requests:
        if request.response:
            # print(request.url)
            if 'place?' in request.url:
                # print('parsing js:')
                front, _ = fid.split(':')
                if request.url.find(front) != -1:
                    print(request.url)
                    resp = brotli.decompress(request.response.body)
                    jstext = resp.decode('utf-8')
                    output = parsing_js(jstext)
                    time.sleep(1)
                    return output
    return 0


def reviews_parsing_js(resp):
    columns_name = ['id', 'author_page', 'author_name', 'profile_photo_url', 'author_review_count',
                    'created_at', 'text', 'photos', 'rating', 'store_review_time', 'store_review']
    jsobj = json.loads(resp[5::])
    result = []
    for i in range(len(jsobj[2])):
        tmp = []
        tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
        tmp += [jsobj[2][i][1], jsobj[2][i][3]]
        # image
        image = []
        if jsobj[2][i][14]:
            for j in range(len(jsobj[2][i][14])):
                image += [jsobj[2][i][14][j][6][0]]
        tmp += [image]
        # rating
        tmp += [jsobj[2][i][4]]
        # store reply
        if jsobj[2][i][9]:
            tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
        else:
            tmp += ['', '']
        result.append(list(map(lambda x, y: {x: y}, columns_name, tmp)))
    return result
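
# Each parsed review is a list of single-key dicts pairing columns_name with
# the extracted values, e.g. (illustrative):
#   [{'id': '...'}, {'author_page': '...'}, {'author_name': '...'}, ...]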


def process_web_request_reviews(driver, output, ludocid):
    time.sleep(3)
    print("reviews&**********************")
    for request in driver.requests:
        if request.response:
            # print(request.url)
            if 'listentitiesreviews?' in request.url:
                # print('parsing js:')
                if request.url.find(ludocid) != -1:
                    print(request.url)
                    resp = brotli.decompress(request.response.body)
                    jstext = resp.decode('utf-8')
                    result = reviews_parsing_js(jstext)
                    output['reviews'] = str(result)
                    time.sleep(1)
                    return output
    return 0


def photos_parsing_js(resp):
    def image_url_change_size(url):
        if url.find('streetviewpixels') != -1:
            return url
        else:
            url_split = url.split('=')
            new_url = url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2::])
            return new_url

    jsobj = json.loads(resp[5::])
    # write_to_file(jsobj,'tmp/debug_{}.pickle'.format(c))
    menu = []
    all_photos = []  # renamed from `all` to avoid shadowing the builtin
    photo_category_map = {}
    for row in jsobj[12][0]:
        photo_category_map[row[0]] = row[2]

    if photo_category_map[jsobj[13][0]] == '全部':  # "all" category
        for img in jsobj[0][:5]:
            all_photos += [image_url_change_size(img[6][0])]
    elif photo_category_map[jsobj[13][0]] == '菜單':  # "menu" category
        for img in jsobj[0][:5]:
            menu += [image_url_change_size(img[6][0])]
    return menu, all_photos
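
# Example of the thumbnail rewrite done by image_url_change_size()
# (URL shortened and hypothetical):
#   .../p/AF1Qip...=w86-h86-k-no  ->  .../p/AF1Qip...=s600-k-no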


def process_web_request_photo(driver, output, fid):
    try:
        driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
        photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
        tab_dict = {}
        for tab_index in [0, 1, 2]:
            selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
            if len(selector) != 0:
                photo_name = selector[0].text
                if photo_name == '菜單':  # "menu" tab
                    tab_dict[photo_name] = tab_index
                elif photo_name == '全部':  # "all" tab
                    tab_dict[photo_name] = tab_index
    except:
        tab_dict = {}
    print(tab_dict)

    # Click every photo tab we found so the corresponding 'photo?' XHRs fire.
    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        time.sleep(1)

    print("photo&**********************")
    menu_list = []
    all_list = []
    for request in driver.requests:
        if request.response:
            # print(request.url)
            if 'photo?' in request.url:
                # print('parsing js:')
                front, _ = fid.split(':')
                if request.url.find(front) != -1:
                    print(request.url)
                    resp = brotli.decompress(request.response.body)
                    jstext = resp.decode('utf-8')
                    menu, all_photos = photos_parsing_js(jstext)
                    menu_list += menu
                    all_list += all_photos
    output['shop_photo'] = str(all_list[:5])
    output['menu_photo'] = str(menu_list[:5])
    return output


def main():
    global chrome_window
    global store_list_table
    global shop_table
    global proxyport
    global iddict
    localip = socket.gethostbyname(socket.gethostname())
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    shop_table = db['shop_list3']
    error_table = db['error_list2']
    iddict = build_cache(db)

    port = 4444
    if len(sys.argv) == 3:
        port = int(sys.argv[1])
        proxyport = int(sys.argv[2])
    if not chrome_window:
        print('restart docker pw{}'.format(port))
        # os.system('sudo docker container restart p'+str(port))
        # os.system('sudo docker container restart pw'+str(port))
        # time.sleep(10)

    print('driver start...')
    driver = brower_start(port)
    job = get_next_job(db)
    c = 0
    for row, group in job.iterrows():
        try:
            item_url = group['item_url']
            name = group['name']
            num = group['num']
            keyword = group['keyword']
            fid = group['fid']
            if name:
                db_name = name
            else:
                db_name = num
            print(fid, keyword, db_name)
            print(item_url)

            # shop info
            print('parsing shop info....')
            for i in range(5):
                print('shop info try...{}'.format(i))
                driver.get(item_url)
                time.sleep(3)
                wait = WebDriverWait(driver, 10)
                wait.until(
                    EC.element_to_be_clickable((By.ID, 'sb_cb50'))
                )
                element = driver.find_element(By.ID, 'sb_cb50')
                driver.implicitly_wait(10)
                ActionChains(driver).move_to_element(element).click(element).perform()
                time.sleep(3)
                driver.back()
                if driver.current_url == item_url:
                    continue
                print(driver.current_url)
                output = process_web_request_start(driver, fid)
                if output != 0:
                    break

            # reviews
            print('parsing reviews....')
            if not output['user_ratings_total']:
                output['reviews'] = ''
            else:
                for i in range(3):
                    print('reviews try...{}'.format(i))
                    try:
                        wait = WebDriverWait(driver, 30)
                        more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
                        wait.until(
                            EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
                        )
                        element = driver.find_element(By.CSS_SELECTOR, more_reviews_css)
                        driver.implicitly_wait(10)
                        ActionChains(driver).move_to_element(element).click(element).perform()
                        time.sleep(0.5)
                        output_ = process_web_request_reviews(driver, output, output['ludocid'])
                        if output_ != 0:
                            output = output_
                            break
                    except:
                        driver.get(item_url)
                        time.sleep(0.5)
            if 'reviews' not in output.keys():
                continue

            # photo
            print('parsing photo....')
            if output['header_image'] != '':
                for i in range(3):
                    print('photo try...{}'.format(i))
                    driver.get(item_url)
                    time.sleep(0.5)
                    print(driver.current_url)
                    try:
                        wait = WebDriverWait(driver, 30)
                        # '{}的相片' = "photos of {name}"
                        wait.until(
                            EC.element_to_be_clickable((By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name'])))
                        )
                        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
                        ActionChains(driver).move_to_element(element).click(element).perform()
                        output = process_web_request_photo(driver, output, fid)
                        break
                    except:
                        pass
            else:
                output['shop_photo'] = '[]'
                output['menu_photo'] = '[]'

            output['item_url'] = item_url
            output['keyword'] = keyword
            if output['google_url'] == '':
                query_name = output['adress_name'].replace('(', '').replace(')', '').replace(' ', '')
                output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
            output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
            print(output)
            save_js_to_db(output, fid)
            error_table.upsert({'item_url': item_url, 'check_': 1}, ['item_url'])
            print('*' * 10)
        except TimeoutException as e:
            traceback.print_exc()
            break
        except:
            error_table3 = db['error_list3']
            error_table3.insert({'name': name, 'keyword': keyword, 'item_url': item_url, 'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
            traceback.print_exc()


if __name__ == '__main__':
    main()