noodles 2 years ago
parent
commit
1519e473bd
1 changed file with 546 additions and 0 deletions

+ 546 - 0
run4.py

@@ -0,0 +1,546 @@
+# -*- coding: utf-8 -*-
+#from selenium import webdriver
+from seleniumwire import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+import selenium
+import traceback
+from bs4 import BeautifulSoup
+
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *
+
+from datetime import datetime
+from requests import session
+import pandas as pd
+import dataset
+import time
+import json
+import re
+import sys, os
+import socket
+import brotli
+import pickle
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import urllib.parse
+chrome_window=False
+globalkw=None
+proxyport=8787
+
+db_columns = ['id','author_page','author_name', 'profile_photo_url', 'author_review_count',
+              'created_at', 'text', 'photos', 'store_review_time','store_review']
+
+
+def write_to_file(jsobj, fname):
+    # Despite the generic name, this pickles the object (only used from
+    # commented-out debug calls below).
+    with open(fname, 'wb') as handle:
+        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
+    # import codecs
+    # fw=codecs.open(fname,'w','utf-8')
+    # fw.write(str(jsobj))
+    # fw.close()
+
+
+def build_cache(db):
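+    # Cache every (fid, author_id) pair already stored in reviews_table so
+    # previously collected reviews are skipped.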
+    global reviews_table
+    id_dict={}
+    cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
+
+    for c in cursor:
+        key = '{}_{}'.format(c['fid'],c['author_id'])
+        id_dict[key]=1
+    return id_dict
+
+
+def browser_start(port):
+    global proxyport
+    global chrome_window
+    print(proxyport)
+    options = webdriver.ChromeOptions()
+    if chrome_window:
+        # Local debugging: drive a visible Chrome window directly.
+        browser = webdriver.Chrome(
+            desired_capabilities=options.to_capabilities()
+        )
+    else:
+        # Containerized run: route traffic through the selenium-wire proxy
+        # and talk to the remote WebDriver hub on the given port.
+        options.add_argument('--proxy-server=host.docker.internal:' + str(proxyport))
+        options.add_argument('--ignore-certificate-errors')
+        options.add_argument("--no-sandbox")
+        options.add_argument("--disable-dev-shm-usage")
+        browser = webdriver.Remote(
+            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
+            desired_capabilities=options.to_capabilities(),
+            seleniumwire_options={'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False}
+        )
+        browser.set_window_size(1400, 1000)
+    return browser
+
+
+def get_next_job(db):
+    # Pull two random rows from the retry queue to process.
+    result = db.query('select * from error_list2 ORDER BY RAND() limit 2')
+    url_pd = pd.DataFrame([dict(i) for i in result])
+
+    # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
+
+    # remove = db.query('select fid from review_process')
+    # remove = pd.DataFrame([dict(i) for i in remove])
+    # remove_fid_list = remove['fid'].to_list()
+
+    # url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]
+
+    return url_pd
+
+
+def parsing_js(resp, db_name):
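+    # Google prefixes its JSON payloads with )]}' to block JSON hijacking;
+    # skip the first five characters before parsing. txt[6] is a positional
+    # (protobuf-style) array describing the place.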
+    txt = json.loads(resp[5::])
+
+    output = {}
+
+    if txt[6][11] != db_name:
+        return 0
+    output['name'] = txt[6][11]
+    output['adress_name'] = txt[6][18]
+
+    if txt[6][4]:
+        if txt[6][4][7]:
+            output['rating'] = str(txt[6][4][7])
+        else:
+            output['rating'] = ''
+
+        if txt[6][4][8]:
+            output['user_ratings_total'] = str(txt[6][4][8])
+        else:
+            output['user_ratings_total'] = ''
+
+        if txt[6][4][2]:
+            output['price_level'] = str(txt[6][4][2].count('$'))
+        else:
+            output['price_level'] = ''
+    else:
+        output['rating'] = ''
+        output['user_ratings_total'] = ''
+        output['price_level'] = ''
+
+    if txt[6][37][0]:
+        output['lon'] = txt[6][37][0][0][8][0][1]
+        output['lat'] = txt[6][37][0][0][8][0][2]    
+    else:
+        output['lon'] = None
+        output['lat'] = None
+
+    if txt[6][178]:
+        output['tel'] = txt[6][178][0][0]
+    else:
+        output['tel'] = ''
+    
+    if txt[6][13]:
+        output['category'] = txt[6][13][0]
+    else:
+        output['category'] = ''
+
+    try:
+        location = txt[6][183][2][2][0]
+        if location:
+            location_s = location.split(' ')
+            output['city'], output['area'] = location_s[-1], location_s[-2]
+        else:
+            output['city'], output['area'] = '', ''
+    except Exception:
+        output['city'], output['area'] = '', ''
+
+    if txt[6][100]:
+        for item in txt[6][100][1]:
+            name = item[1]
+            if name not in intro_list.keys(): continue
+            name_map = intro_list[name]
+            c = 0
+            detail = []
+            for t in item[2]:
+                value = t[1]
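+                # t[3] == 1 flags a negative attribute; 不提供 means "does not offer".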
+                if t[3] == 1:
+                    detail += [{'id':c, name_map[1]:'不提供'+str(value)}]
+                else:
+                    detail += [{'id':c, name_map[1]:value}]
+                c += 1
+            output[name_map[0]] = str(detail)
+
+    for key in intro_list:
+        if intro_list[key][0] not in output.keys():
+            output[intro_list[key][0]] = '[]'
+            
+    if txt[6][34]:
+        output = time_parsing_js(txt[6][34], output)
+    else:
+        output['open_now'] = 'False'
+        output['periods'] = ''
+        output['weekday_text'] = ''
+        output['time_status'] = ''
+    
+    if txt[6][72]:
+        output['header_image'] = txt[6][72][0][0][6][0]
+    else:
+        output['header_image'] = ''
+
+    print(output)
+    # write_to_file(orig,'debug.pickle')
+    return output 
+
+
+def time_parsing_js(time_json, output):
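+    # Convert zh-TW opening-hours strings into a Places-API-style "periods"
+    # list; '24 小時營業' means "open 24 hours" and '休息' means "closed".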
+    weekday_text = []
+    periods = []
+
+    for time_ in time_json[1]:
+        week = time_[0]
+        weekday_text += ['{}: {}'.format(week, ', '.join(time_[1]))]
+        
+        for t in time_[1]:
+            if t == '24 小時營業':
+                periods += [{
+                                "open":{
+                                    "day": week_list[week], 
+                                    "time": '0000'
+                                },
+                                "close":{
+                                    "day": week_list[week], 
+                                    "time": ''
+                                }
+                            }]
+            elif t == '休息':
+                periods += [{
+                            "open":{
+                                "day": week_list[week], 
+                                "time": ''
+                            },
+                            "close":{
+                                "day": week_list[week], 
+                                "time": ''
+                            }
+                        }]
+            else:
+                start, end = t.split('–')
+                end_hour, end_min = end.split(':')
+                start_hour, start_min = start.split(':')
+
+                # Compare numerically: a closing hour smaller than the opening
+                # hour means the interval crosses midnight into the next day.
+                if int(end_hour) < int(start_hour):
+                    end_day = week_list[week] + 1
+                else:
+                    end_day = week_list[week]
+
+                periods += [{
+                    "open":{
+                        "day": week_list[week], 
+                        "time": start.replace(':','')
+                    },
+                    "close":{
+                        "day": end_day, 
+                        "time": end.replace(':','')
+                    }
+                }]
+                
+    output['periods'] = str(periods)
+    output['weekday_text'] = str(weekday_text)
+    output['time_status'] = blank_check(time_json[4][4].split('⋅')[0])
+
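+    # Mark the shop closed when the status banner reads 永久停業 (permanently
+    # closed), 暫時關閉 (temporarily closed), or 暫停營業 (operations suspended).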
+    if output['time_status'].find('永久停業') != -1 or\
+       output['time_status'].find('暫時關閉') != -1 or\
+       output['time_status'].find('暫停營業') != -1:
+        output['open_now'] = 'False'
+    else:
+        output['open_now'] = 'True'
+
+    return output
+
+
+def save_js_to_db(jsobj, fid):
+    global reviews_table
+    global iddict
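+    # Skip any (fid, author_id) pair already present in the iddict cache.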
+    for r in jsobj:
+        r['fid'] = fid
+        key = '{}_{}'.format(r['fid'], r['author_id'])
+        if iddict.get(key) is not None:
+            continue
+        try:
+            r['review_image'] = str(r['review_image'])
+            reviews_table.insert(r)
+        except Exception:
+            traceback.print_exc()
+
+
+def process_web_request_start(driver, db_name):
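+    # Scan the requests captured by selenium-wire for the place? XHR and
+    # decode its brotli-compressed body.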
+    time.sleep(5)
+
+    print("start&**********************")
+    for request in driver.requests:
+        if request.response:
+            # print(request.url)
+            if 'place?' in request.url :
+                print('parsing js:')
+                print(request.url)
+                resp = brotli.decompress(request.response.body)
+                jstext = resp.decode('utf-8')
+                output = parsing_js(jstext, db_name)
+                time.sleep(1)
+                return output
+    return 0
+
+
+def reviews_parsing_js(resp):
+    columns_name = ['author_id','author_page','author_name', 'author_image', 'author_review_count',
+              'review_time', 'review_content', 'review_image',
+              'rating', 'store_review_time','store_review']
+    jsobj = json.loads(resp[5::])
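+    # jsobj[2] holds one positional array per review; the indices below map
+    # Google's undocumented fields onto columns_name.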
+    result = []
+    for i in range(len(jsobj[2])):
+        tmp = []
+        tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
+        tmp += [jsobj[2][i][1], jsobj[2][i][3]]
+
+        # image
+        image = []
+        if jsobj[2][i][14]:
+            for j in range(len(jsobj[2][i][14])):
+                image += [jsobj[2][i][14][j][6][0]]
+        tmp += [image]
+
+        #rating
+        tmp += [jsobj[2][i][4]]
+
+        # store reply
+        if jsobj[2][i][9]:
+            tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
+        else:
+            tmp += ['', '']
+
+        result.append(list(map(lambda x, y: {x:y}, columns_name, tmp)))
+
+    return result 
+
+
+def process_web_request_reviews(driver, output):
+    time.sleep(3.8)
+
+    print("reviews&**********************")
+    for request in driver.requests:
+        if request.response:
+            # print(request.url)
+            if 'listentitiesreviews?' in request.url :
+                print('parsing js:')
+                print(request.url)
+                resp = brotli.decompress(request.response.body)
+                jstext = resp.decode('utf-8')
+                result = reviews_parsing_js(jstext)
+                output['reviews'] = str(result)
+                time.sleep(1)
+
+                return output
+
+    # No reviews XHR was captured; return output unchanged rather than None.
+    return output
+
+
+def photos_parsing_js(resp, c):
+    def image_url_change_size(url):
+        # Swap the size token in the photo URL so Google serves a 600px version.
+        url_split = url.split('=')
+        new_url = url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2::])
+        return new_url
+
+    jsobj = json.loads(resp[5::])
+    # write_to_file(jsobj,'tmp/debug_{}.pickle'.format(c))
+
+    menu = []
+    all_photos = []  # renamed from "all" to avoid shadowing the built-in
+
+    # jsobj[10] identifies which photo tab produced this payload: 0 is the
+    # "全部" (all photos) tab, anything else is treated as the menu tab.
+    if jsobj[10] == 0:
+        for img in jsobj[0]:
+            all_photos += [image_url_change_size(img[6][0])]
+    else:
+        for img in jsobj[0]:
+            menu += [image_url_change_size(img[6][0])]
+
+    return menu, all_photos
+    
+
+
+def process_web_request_photo(driver, output):
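+    # Click the 菜單 (menu) and 全部 (all photos) tabs, if present, so the
+    # photo? XHRs for both galleries are captured by the proxy.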
+    try:
+        driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
+        photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
+        tab_dict = {}
+        for tab_index in [0, 1, 2]:
+            selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
+            if len(selector) != 0:
+                photo_name = selector[0].text
+                if photo_name == '菜單':
+                    tab_dict[photo_name] = tab_index
+                elif photo_name == '全部':
+                    tab_dict[photo_name] = tab_index
+    except Exception:
+        tab_dict = {}
+    print(tab_dict)
+
+    for tab_ in tab_dict:
+        tab_index = tab_dict[tab_]
+        print(tab_index)
+        wait = WebDriverWait(driver, 60)
+        wait.until(
+            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
+        )
+        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
+        ActionChains(driver).move_to_element(element).click(element).perform()
+        time.sleep(2)
+
+    print("photo&**********************")
+    menu_list = []
+    all_list = []
+    c = 0  # counter for the optional per-response debug pickle filename
+    for request in driver.requests:
+        if request.response:
+            # print(request.url)
+            if 'photo?' in request.url :
+                print('parsing js:')
+                print(request.url)
+                resp = brotli.decompress(request.response.body)
+                jstext = resp.decode('utf-8')
+                menu, all_photos = photos_parsing_js(jstext, c)
+                menu_list += menu
+                all_list += all_photos
+                c += 1
+
+    output['shop_photo'] = str(all_list)
+    output['menu_photo'] = str(menu_list)
+
+    return output
+    
+
+def main():
+    global chrome_window
+    global store_list_table
+    global reviews_table
+    global proxyport
+    global iddict
+
+    localip=socket.gethostbyname(socket.gethostname())
+
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    store_list_table = db['swire_store_list']
+    shop_table = db['shop_list4']
+
+    iddict=build_cache(db)
+    
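+    # Optional CLI override: python run4.py <webdriver_port> <proxy_port>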
+    port=4444
+    if len(sys.argv) == 3 :
+        port=int(sys.argv[1])
+        proxyport=int(sys.argv[2])
+    if not chrome_window:
+        print('restart docker pw{}'.format(port))
+#        os.system('sudo docker container restart p'+str(port))
+        os.system('sudo docker container restart pw'+str(port))
+
+        time.sleep(10)
+
+    print('driver start...')
+    driver = browser_start(port)
+
+    job = get_next_job(db)
+
+    for row, group in job.iterrows():
+        try:
+            item_url = group['item_url']
+            name = group['name']
+            num = group['num']
+            keyword = group['keyword']
+
+            if name:
+                db_name = name
+            else:
+                db_name = num
+
+            print(name, num, keyword, db_name)
+            print(item_url)
+
+            #shop_info
+            print('parsing shop info....')
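+            # Open the place page and click the search button to force the
+            # place? XHR, retrying up to five times until parsing succeeds.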
+            for i in range(5):
+                print('shop info try...{}'.format(i))
+                driver.get(item_url)
+                time.sleep(2)
+
+                element = driver.find_element_by_id('searchbox-searchbutton')
+                driver.implicitly_wait(10)
+                ActionChains(driver).move_to_element(element).click(element).perform()
+                time.sleep(5)
+                driver.back()
+
+                if driver.current_url == item_url: continue
+                print(driver.current_url)
+                output = process_web_request_start(driver, db_name)
+                if output != 0: break
+
+
+            # reviews
+            print('parsing reviews....')
+            if output['user_ratings_total'] == '':
+                output['reviews'] = ''
+            else:
+                for i in range(3):
+                    print('reviews try...{}'.format(i))
+                    try:
+                        wait = WebDriverWait(driver, 30)
+                        more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
+                        wait.until(
+                            EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
+                        )
+                        element = driver.find_element_by_css_selector(more_reviews_css)
+                        driver.implicitly_wait(10)
+                        ActionChains(driver).move_to_element(element).click(element).perform()
+                        time.sleep(0.5)
+
+                        output = process_web_request_reviews(driver, output)
+                        break
+                    except Exception:
+                        driver.get(item_url)
+                        time.sleep(0.5)
+
+            # photo
+            print('parsing photo....')
+            if output['header_image'] != '':
+                for i in range(3):
+                    print('photo try...{}'.format(i))
+                    driver.get(item_url)
+                    time.sleep(0.5)
+                    print(driver.current_url)
+                    try:
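+                        # "{}的相片" means "photos of {}"; clicking the header
+                        # image div opens the photo gallery.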
+                        wait = WebDriverWait(driver, 30)
+                        wait.until(
+                            EC.element_to_be_clickable((By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name'])))
+                        )
+                        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
+                        ActionChains(driver).move_to_element(element).click(element).perform()
+
+                        output = process_web_request_photo(driver, output)
+                        break
+                    except Exception:
+                        pass
+
+            else:
+                output['shop_photo'] = '[]'
+                output['menu_photo'] = '[]'
+
+            print(output)
+            query_name = output['adress_name'].replace('(','').replace(')', '').replace(' ','')
+            output['item_url'] = item_url
+            output['keyword'] = keyword
+            output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
+            output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
+
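+            # Note: dataset's insert() does not deduplicate; switching to
+            # upsert(output, ['item_url']) would key rows on item_url.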
+            shop_table.insert(output,['item_url'])
+        except Exception:
+            traceback.print_exc()
+
+if __name__ == '__main__':
+    main()