# run4.py
# -*- coding: utf-8 -*-
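"""Google Maps place scraper.

Workflow, as implemented below:
  1. pull up to 10 unchecked rows from `error_list2` (get_next_job),
  2. open each item_url in a seleniumwire-proxied remote Chrome,
  3. decode the intercepted `place?`, `listentitiesreviews?` and `photo?`
     responses (brotli-compressed JSON behind a 5-character guard prefix),
  4. write the parsed record to `shop_list3` and mark the URL checked.

Usage: python run4.py [selenium_port] [proxy_port]  (defaults: 4444 / 8787).
`intro_list`, `week_list` and `blank_check` are expected to come from the
starred utility imports below.
"""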
# from selenium import webdriver
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
import traceback
from bs4 import BeautifulSoup

from utility import database_access as DA
from utility.parseutils import *  # provides intro_list, week_list, blank_check
from utility.connect import *

from datetime import datetime
import pandas as pd
import dataset
import time
import json
import sys
import os
import socket
import brotli
import pickle

chrome_window = False
globalkw = None
proxyport = 8787
def write_to_file(jsobj, fname):
    # Debug helper: pickle an arbitrary object to disk.
    with open(fname, 'wb') as handle:
        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
def build_cache(db):
    # Cache every fid already present in shop_list3 so finished shops
    # are not re-inserted by save_js_to_db().
    id_dict = {}
    cursor = db.query('SELECT fid FROM google_poi.shop_list3;')
    for c in cursor:
        id_dict['{}'.format(c['fid'])] = 1
    return id_dict
def browser_start(port):
    global proxyport
    global chrome_window
    print(proxyport)
    options = webdriver.ChromeOptions()
    if chrome_window:
        # Local, visible Chrome (development).
        browser = webdriver.Chrome(
            desired_capabilities=options.to_capabilities()
        )
    else:
        # Remote Chrome routed through the seleniumwire proxy (docker).
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--proxy-server=host.docker.internal:' + str(proxyport))  # Specify your Kubernetes service-name here
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
            desired_capabilities=chrome_options.to_capabilities(),
            seleniumwire_options={'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False}
        )
    browser.set_window_size(1400, 1000)
    return browser
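# Example (a sketch, assuming the dockerised Selenium node and the proxy
# port that main() uses by default):
#   driver = browser_start(4444)   # Selenium hub on :4444, proxy on :8787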
def get_next_job(db):
    # Pick up to 10 random unchecked rows, derive each fid from the URL,
    # then drop any URL that already failed (error_list3).
    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 10')
    url_pd = pd.DataFrame([dict(i) for i in result])
    url_pd['fid'] = url_pd['item_url'].apply(lambda x: x.split('data=')[1].split('!')[3])
    # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))

    remove = db.query('select item_url from error_list3')
    remove = pd.DataFrame([dict(i) for i in remove])
    if len(remove) != 0:
        remove_fid_list = remove['item_url'].to_list()
        url_pd = url_pd[~url_pd['item_url'].isin(remove_fid_list)]
    return url_pd
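# parsing_js() decodes the intercepted `place?` response. The payload is
# JSON behind a 5-character guard prefix (Google's ")]}'" plus newline),
# and every numeric index used below (txt[6][11], txt[6][4][7], ...) is a
# reverse-engineered position in that undocumented array, so any of them
# may break when Google changes the response layout.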
def parsing_js(resp):
    txt = json.loads(resp[5:])  # drop the guard prefix
    output = {}
    output['name'] = txt[6][11]
    output['adress_name'] = txt[6][18]

    if txt[6][4]:
        if txt[6][4][7]:
            output['rating'] = str(txt[6][4][7])
        else:
            output['rating'] = ''

        if txt[6][4][8]:
            output['user_ratings_total'] = str(txt[6][4][8])
        else:
            output['user_ratings_total'] = ''

        if txt[6][4][2]:
            output['price_level'] = str(txt[6][4][2].count('$'))
        else:
            output['price_level'] = ''
    else:
        output['rating'] = ''
        output['user_ratings_total'] = ''
        output['price_level'] = ''

    if txt[6][37][0]:
        output['lon'] = txt[6][37][0][0][8][0][1]
        output['lat'] = txt[6][37][0][0][8][0][2]
    else:
        output['lon'] = None
        output['lat'] = None

    if txt[6][178]:
        output['tel'] = txt[6][178][0][0]
    else:
        output['tel'] = ''

    if txt[6][13]:
        output['category'] = txt[6][13][0]
    else:
        output['category'] = ''

    try:
        location = txt[6][183][2][2][0]
        if location:
            location_s = location.split(' ')
            output['city'], output['area'] = location_s[-1], location_s[-2]
        else:
            output['city'], output['area'] = '', ''
    except Exception:
        output['city'], output['area'] = '', ''

    if txt[6][100]:
        # Amenity/attribute sections; intro_list maps Google's section
        # name to (output column, per-item key).
        for item in txt[6][100][1]:
            name = item[1]
            if name not in intro_list.keys():
                continue
            name_map = intro_list[name]
            c = 0
            detail = []
            for t in item[2]:
                value = t[1]
                if t[3] == 1:
                    # t[3] == 1 marks a negated attribute
                    detail += [{'id': c, name_map[1]: '不提供' + str(value)}]  # '不提供' = "not offered"
                else:
                    detail += [{'id': c, name_map[1]: value}]
                c += 1
            output[name_map[0]] = str(detail)

    # Make sure every intro_list column exists even when absent upstream.
    for key in intro_list:
        if intro_list[key][0] not in output.keys():
            output[intro_list[key][0]] = '[]'

    if txt[6][34]:
        output = time_parsing_js(txt[6][34], output)
    else:
        output['open_now'] = 'False'
        output['periods'] = ''
        output['weekday_text'] = ''
        output['time_status'] = ''

    if txt[6][72]:
        output['header_image'] = txt[6][72][0][0][6][0]
    else:
        output['header_image'] = ''

    print(output)
    # write_to_file(orig,'debug.pickle')
    return output
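# time_parsing_js() rebuilds `periods` / `weekday_text` in the same shape
# as the Google Places Details API opening_hours fields; week_list (from
# the utility imports) is assumed to map the weekday label to the API's
# 0-6 day number.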
def time_parsing_js(time_json, output):
    weekday_text = []
    periods = []
    for time_ in time_json[1]:
        week = time_[0]
        weekday_text += ['{}: {}'.format(week, ', '.join(time_[1]))]
        for t in time_[1]:
            if t == '24 小時營業':  # "Open 24 hours"
                periods += [{
                    "open": {"day": week_list[week], "time": '0000'},
                    "close": {"day": week_list[week], "time": ''}
                }]
            elif t == '休息':  # "Closed"
                periods += [{
                    "open": {"day": week_list[week], "time": ''},
                    "close": {"day": week_list[week], "time": ''}
                }]
            else:
                start, end = t.split('–')
                end_hour, end_min = end.split(':')
                start_hour, start_min = start.split(':')
                # An interval that ends past midnight closes on the next day.
                if int(end_hour) < int(start_hour):
                    end_day = week_list[week] + 1
                else:
                    end_day = week_list[week]
                periods += [{
                    "open": {"day": week_list[week], "time": start.replace(':', '')},
                    "close": {"day": end_day, "time": end.replace(':', '')}
                }]

    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    output['time_status'] = blank_check(time_json[4][4].split('⋅')[0])
    # '永久停業' = permanently closed; '暫時關閉' / '暫停營業' = temporarily closed
    if output['time_status'].find('永久停業') != -1 or \
       output['time_status'].find('暫時關閉') != -1 or \
       output['time_status'].find('暫停營業') != -1:
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'
    return output
def save_js_to_db(jsobj, fid):
    global shop_table
    global iddict
    jsobj['fid'] = fid
    if iddict.get(fid) is None:  # skip shops already in shop_list3
        try:
            shop_table.insert(jsobj)
        except Exception:
            traceback.print_exc()
def process_web_request_start(driver, fid):
    time.sleep(3)
    print("start&**********************")
    for request in driver.requests:
        if request.response:
            # print(request.url)
            if 'place?' in request.url:
                print('parsing js:')
                front, _ = fid.split(':')
                if request.url.find(front) != -1:
                    print(request.url)
                    resp = brotli.decompress(request.response.body)
                    jstext = resp.decode('utf-8')
                    output = parsing_js(jstext)
                    time.sleep(1)
                    return output, request.url
    return 0, 0
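# Review payloads carry the same 5-character guard prefix; each review in
# jsobj[2] is flattened into a list of single-key dicts, one per entry in
# columns_name below.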
def reviews_parsing_js(resp):
    columns_name = ['id', 'author_page', 'author_name', 'profile_photo_url', 'author_review_count',
                    'created_at', 'text', 'photos', 'rating', 'store_review_time', 'store_review']
    jsobj = json.loads(resp[5:])  # drop the guard prefix
    result = []
    for i in range(len(jsobj[2])):
        tmp = []
        tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
        tmp += [jsobj[2][i][1], jsobj[2][i][3]]

        # image
        image = []
        if jsobj[2][i][14]:
            for j in range(len(jsobj[2][i][14])):
                image += [jsobj[2][i][14][j][6][0]]
        tmp += [image]

        # rating
        tmp += [jsobj[2][i][4]]

        # store reply
        if jsobj[2][i][9]:
            tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
        else:
            tmp += ['', '']

        result.append(list(map(lambda x, y: {x: y}, columns_name, tmp)))
    return result
def process_web_request_reviews(driver, output, start_js):
    time.sleep(3)
    print("reviews&**********************")
    for request in driver.requests:
        if request.response:
            # print(request.url)
            if 'listentitiesreviews?' in request.url:
                print('parsing js:')
                if start_js.find(request.url.split('!')[-2]) != -1:
                    print(request.url)
                    resp = brotli.decompress(request.response.body)
                    jstext = resp.decode('utf-8')
                    result = reviews_parsing_js(jstext)
                    output['reviews'] = str(result)
                    time.sleep(1)
                    return output
    return 0
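# photos_parsing_js() keys the photo category off jsobj[13][0] via the
# id -> label map in jsobj[12][0]; only the '全部' ("All") and '菜單'
# ("Menu") tabs are collected, five images each.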
def photos_parsing_js(resp):
    def image_url_change_size(url):
        # Street-view tiles keep their original URL; other photos are
        # rewritten to request a 600px copy.
        if url.find('streetviewpixels') != -1:
            return url
        url_split = url.split('=')
        return url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2:])

    jsobj = json.loads(resp[5:])  # drop the guard prefix
    # write_to_file(jsobj,'tmp/debug_{}.pickle'.format(c))
    menu = []
    all_photos = []
    photo_category_map = {}
    for row in jsobj[12][0]:
        photo_category_map[row[0]] = row[2]

    if photo_category_map[jsobj[13][0]] == '全部':  # "All" tab
        for img in jsobj[0][:5]:
            all_photos += [image_url_change_size(img[6][0])]
    elif photo_category_map[jsobj[13][0]] == '菜單':  # "Menu" tab
        for img in jsobj[0][:5]:
            menu += [image_url_change_size(img[6][0])]
    return menu, all_photos
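# process_web_request_photo() clicks each photo tab it can find (via its
# data-tab-index button) so the browser fires the `photo?` requests that
# photos_parsing_js() then decodes from driver.requests.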
def process_web_request_photo(driver, output, fid):
    try:
        driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
        photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
        tab_dict = {}
        for tab_index in [0, 1, 2]:
            selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
            if len(selector) != 0:
                photo_name = selector[0].text
                if photo_name == '菜單':  # "Menu"
                    tab_dict[photo_name] = tab_index
                elif photo_name == '全部':  # "All"
                    tab_dict[photo_name] = tab_index
    except Exception:
        tab_dict = {}

    print(tab_dict)
    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        time.sleep(1)

    print("photo&**********************")
    menu_list = []
    all_list = []
    for request in driver.requests:
        if request.response:
            # print(request.url)
            if 'photo?' in request.url:
                print('parsing js:')
                front, _ = fid.split(':')
                if request.url.find(front) != -1:
                    print(request.url)
                    resp = brotli.decompress(request.response.body)
                    jstext = resp.decode('utf-8')
                    menu, all_photos = photos_parsing_js(jstext)
                    menu_list += menu
                    all_list += all_photos

    output['shop_photo'] = str(all_list[:5])
    output['menu_photo'] = str(menu_list[:5])
    return output
def main():
    global chrome_window
    global store_list_table
    global shop_table
    global proxyport
    global iddict

    localip = socket.gethostbyname(socket.gethostname())
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    shop_table = db['shop_list3']
    error_table = db['error_list2']
    iddict = build_cache(db)

    port = 4444
    if len(sys.argv) == 3:
        port = int(sys.argv[1])
        proxyport = int(sys.argv[2])

    if not chrome_window:
        print('restart docker pw{}'.format(port))
        # os.system('sudo docker container restart p'+str(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(10)

    print('driver start...')
    driver = browser_start(port)
    job = get_next_job(db)

    for row, group in job.iterrows():
        try:
            item_url = group['item_url']
            name = group['name']
            num = group['num']
            keyword = group['keyword']
            fid = group['fid']
            if name:
                db_name = name
            else:
                db_name = num

            print(fid, keyword, db_name)
            print(item_url)

            # ActionChains(driver).key_down(Keys.SHIFT).key_down(Keys.F5).perform()
            # driver.find_element_by_tag_name('body').send_keys(Keys.F5)
            # time.sleep(3)

            # shop info
            print('parsing shop info....')
            for i in range(5):
                print('shop info try...{}'.format(i))
                driver.get(item_url)
                time.sleep(3)
                wait = WebDriverWait(driver, 10)
                wait.until(
                    EC.element_to_be_clickable((By.ID, 'sb_cb50'))
                )
                element = driver.find_element_by_id('sb_cb50')
                driver.implicitly_wait(10)
                ActionChains(driver).move_to_element(element).click(element).perform()
                time.sleep(3)
                driver.back()
                if driver.current_url == item_url:
                    continue
                print(driver.current_url)
                output, start_js = process_web_request_start(driver, fid)
                if output != 0:
                    break

            # reviews
            print('parsing reviews....')
            if output['user_ratings_total'] == '':
                output['reviews'] = ''
            else:
                for i in range(3):
                    print('reviews try...{}'.format(i))
                    try:
                        wait = WebDriverWait(driver, 30)
                        more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
                        wait.until(
                            EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
                        )
                        element = driver.find_element_by_css_selector(more_reviews_css)
                        driver.implicitly_wait(10)
                        ActionChains(driver).move_to_element(element).click(element).perform()
                        time.sleep(0.5)
                        output_ = process_web_request_reviews(driver, output, start_js)
                        if output_ != 0:
                            output = output_
                            break
                    except Exception:
                        driver.get(item_url)
                        time.sleep(0.5)
                if 'reviews' not in output.keys():
                    continue

            # photo
            print('parsing photo....')
            if output['header_image'] != '':
                for i in range(3):
                    print('photo try...{}'.format(i))
                    driver.get(item_url)
                    time.sleep(0.5)
                    print(driver.current_url)
                    try:
                        wait = WebDriverWait(driver, 30)
                        # "...的相片" = "photos of ..."
                        wait.until(
                            EC.element_to_be_clickable((By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name'])))
                        )
                        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
                        ActionChains(driver).move_to_element(element).click(element).perform()
                        output = process_web_request_photo(driver, output, fid)
                        break
                    except Exception:
                        pass
            else:
                output['shop_photo'] = '[]'
                output['menu_photo'] = '[]'

            print(output)
            query_name = output['adress_name'].replace('(', '').replace(')', '').replace(' ', '')
            output['item_url'] = item_url
            output['keyword'] = keyword
            output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
            output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")

            save_js_to_db(output, fid)
            error_table.upsert({'item_url': item_url, 'check_': 1}, ['item_url'])
            print('*' * 10)
        except Exception:
            error_table3 = db['error_list3']
            error_table3.insert({'name': name, 'keyword': keyword, 'item_url': item_url,
                                 'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
            traceback.print_exc()
if __name__ == '__main__':
    main()
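# Invocation sketch (assuming the docker container naming used above):
#   sudo docker container restart pw4444
#   python run4.py 4444 8787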