noodles 2 years ago
parent
commit
1519e473bd
1 changed file with 546 additions and 0 deletions

+ 546 - 0
run4.py

@@ -0,0 +1,546 @@
+# -*- coding: utf-8 -*-
+#from selenium import webdriver
+from seleniumwire import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+import selenium
+import traceback
+from bs4 import BeautifulSoup
+
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *
+
+from datetime import datetime
+from requests import session
+import pandas as pd
+import dataset
+import time
+import json
+import re
+import sys, os
+import socket
+import brotli
+import pickle
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import urllib.parse
+chrome_window=False
+globalkw=None
+proxyport=8787
+
+db_columns = ['id','author_page','author_name', 'profile_photo_url', 'author_review_count',
+              'created_at', 'text', 'photos', 'store_review_time','store_review']
+
+
+def write_to_file(jsobj, fname):
+    # Despite the generic name, this pickles the object (only used from
+    # commented-out debug calls below).
+    with open(fname, 'wb') as handle:
+        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
+    # import codecs
+    # fw=codecs.open(fname,'w','utf-8')
+    # fw.write(str(jsobj))
+    # fw.close()
+
+
+def build_cache(db):
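+    # Cache every (fid, author_id) pair already stored in reviews_table so
+    # previously collected reviews are skipped.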
+    global reviews_table
+    id_dict={}
+    cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
+
+    for c in cursor:
+        key = '{}_{}'.format(c['fid'],c['author_id'])
+        id_dict[key]=1
+    return id_dict
+
+
+def browser_start(port):
+    global proxyport
+    global chrome_window
+    print(proxyport)
+    options = webdriver.ChromeOptions()
+    if chrome_window:
+        # Local debugging: drive a visible Chrome window directly.
+        browser = webdriver.Chrome(
+            desired_capabilities=options.to_capabilities()
+        )
+    else:
+        # Containerized run: route traffic through the selenium-wire proxy
+        # and talk to the remote WebDriver hub on the given port.
+        options.add_argument('--proxy-server=host.docker.internal:' + str(proxyport))
+        options.add_argument('--ignore-certificate-errors')
+        options.add_argument("--no-sandbox")
+        options.add_argument("--disable-dev-shm-usage")
+        browser = webdriver.Remote(
+            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
+            desired_capabilities=options.to_capabilities(),
+            seleniumwire_options={'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False}
+        )
+        browser.set_window_size(1400, 1000)
+    return browser
+
+
+def get_next_job(db):
+    # Pull two random rows from the retry queue to process.
+    result = db.query('select * from error_list2 ORDER BY RAND() limit 2')
+    url_pd = pd.DataFrame([dict(i) for i in result])
+
+    # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
+
+    # remove = db.query('select fid from review_process')
+    # remove = pd.DataFrame([dict(i) for i in remove])
+    # remove_fid_list = remove['fid'].to_list()
+
+    # url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]
+
+    return url_pd
+
+
+def parsing_js(resp, db_name):
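+    # Google prefixes its JSON payloads with )]}' to block JSON hijacking;
+    # skip the first five characters before parsing. txt[6] is a positional
+    # (protobuf-style) array describing the place.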
+    txt = json.loads(resp[5::])
+
+    output = {}
+
+    if txt[6][11] != db_name:
+        return 0
+    output['name'] = txt[6][11]
+    output['adress_name'] = txt[6][18]
+
+    if txt[6][4]:
+        if txt[6][4][7]:
+            output['rating'] = str(txt[6][4][7])
+        else:
+            output['rating'] = ''
+
+        if txt[6][4][8]:
+            output['user_ratings_total'] = str(txt[6][4][8])
+        else:
+            output['user_ratings_total'] = ''
+
+        if txt[6][4][2]:
+            output['price_level'] = str(txt[6][4][2].count('$'))
+        else:
+            output['price_level'] = ''
+    else:
+        output['rating'] = ''
+        output['user_ratings_total'] = ''
+        output['price_level'] = ''
+
+    if txt[6][37][0]:
+        output['lon'] = txt[6][37][0][0][8][0][1]
+        output['lat'] = txt[6][37][0][0][8][0][2]    
+    else:
+        output['lon'] = None
+        output['lat'] = None
+
+    if txt[6][178]:
+        output['tel'] = txt[6][178][0][0]
+    else:
+        output['tel'] = ''
+    
+    if txt[6][13]:
+        output['category'] = txt[6][13][0]
+    else:
+        output['category'] = ''
+
+    try:
+        location = txt[6][183][2][2][0]
+        if location:
+            location_s = location.split(' ')
+            output['city'], output['area'] = location_s[-1], location_s[-2]
+        else:
+            output['city'], output['area'] = '', ''
+    except Exception:
+        output['city'], output['area'] = '', ''
+
+    if txt[6][100]:
+        for item in txt[6][100][1]:
+            name = item[1]
+            if name not in intro_list.keys(): continue
+            name_map = intro_list[name]
+            c = 0
+            detail = []
+            for t in item[2]:
+                value = t[1]
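+                # t[3] == 1 flags a negative attribute; 不提供 means "does not offer".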
+                if t[3] == 1:
+                    detail += [{'id':c, name_map[1]:'不提供'+str(value)}]
+                else:
+                    detail += [{'id':c, name_map[1]:value}]
+                c += 1
+            output[name_map[0]] = str(detail)
+
+    for key in intro_list:
+        if intro_list[key][0] not in output.keys():
+            output[intro_list[key][0]] = '[]'
+            
+    if txt[6][34]:
+        output = time_parsing_js(txt[6][34], output)
+    else:
+        output['open_now'] = 'False'
+        output['periods'] = ''
+        output['weekday_text'] = ''
+        output['time_status'] = ''
+    
+    if txt[6][72]:
+        output['header_image'] = txt[6][72][0][0][6][0]
+    else:
+        output['header_image'] = ''
+
+    print(output)
+    # write_to_file(orig,'debug.pickle')
+    return output 
+
+
+def time_parsing_js(time_json, output):
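+    # Convert zh-TW opening-hours strings into a Places-API-style "periods"
+    # list; '24 小時營業' means "open 24 hours" and '休息' means "closed".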
+    weekday_text = []
+    periods = []
+
+    for time_ in time_json[1]:
+        week = time_[0]
+        weekday_text += ['{}: {}'.format(week, ', '.join(time_[1]))]
+        
+        for t in time_[1]:
+            if t == '24 小時營業':
+                periods += [{
+                                "open":{
+                                    "day": week_list[week], 
+                                    "time": '0000'
+                                },
+                                "close":{
+                                    "day": week_list[week], 
+                                    "time": ''
+                                }
+                            }]
+            elif t == '休息':
+                periods += [{
+                            "open":{
+                                "day": week_list[week], 
+                                "time": ''
+                            },
+                            "close":{
+                                "day": week_list[week], 
+                                "time": ''
+                            }
+                        }]
+            else:
+                start, end = t.split('–')
+                end_hour, end_min = end.split(':')
+                start_hour, start_min = start.split(':')
+
+                # Compare numerically: a closing hour smaller than the opening
+                # hour means the interval crosses midnight into the next day.
+                if int(end_hour) < int(start_hour):
+                    end_day = week_list[week] + 1
+                else:
+                    end_day = week_list[week]
+
+                periods += [{
+                    "open":{
+                        "day": week_list[week], 
+                        "time": start.replace(':','')
+                    },
+                    "close":{
+                        "day": end_day, 
+                        "time": end.replace(':','')
+                    }
+                }]
+                
+    output['periods'] = str(periods)
+    output['weekday_text'] = str(weekday_text)
+    output['time_status'] = blank_check(time_json[4][4].split('⋅')[0])
+
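+    # Mark the shop closed when the status banner reads 永久停業 (permanently
+    # closed), 暫時關閉 (temporarily closed), or 暫停營業 (operations suspended).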
+    if output['time_status'].find('永久停業') != -1 or\
+       output['time_status'].find('暫時關閉') != -1 or\
+       output['time_status'].find('暫停營業') != -1:
+        output['open_now'] = 'False'
+    else:
+        output['open_now'] = 'True'
+
+    return output
+
+
+def save_js_to_db(jsobj, fid):
+    global reviews_table
+    global iddict
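+    # Skip any (fid, author_id) pair already present in the iddict cache.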
+    for r in jsobj:
+        r['fid'] = fid
+        key = '{}_{}'.format(r['fid'], r['author_id'])
+        if iddict.get(key) is not None:
+            continue
+        try:
+            r['review_image'] = str(r['review_image'])
+            reviews_table.insert(r)
+        except Exception:
+            traceback.print_exc()
+
+
+def process_web_request_start(driver, db_name):
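+    # Scan the requests captured by selenium-wire for the place? XHR and
+    # decode its brotli-compressed body.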
+    time.sleep(5)
+
+    print("start&**********************")
+    for request in driver.requests:
+        if request.response:
+            # print(request.url)
+            if 'place?' in request.url :
+                print('parsing js:')
+                print(request.url)
+                resp = brotli.decompress(request.response.body)
+                jstext = resp.decode('utf-8')
+                output = parsing_js(jstext, db_name)
+                time.sleep(1)
+                return output
+    return 0
+
+
+def reviews_parsing_js(resp):
+    columns_name = ['author_id','author_page','author_name', 'author_image', 'author_review_count',
+              'review_time', 'review_content', 'review_image',
+              'rating', 'store_review_time','store_review']
+    jsobj = json.loads(resp[5::])
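+    # jsobj[2] holds one positional array per review; the indices below map
+    # Google's undocumented fields onto columns_name.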
+    result = []
+    for i in range(len(jsobj[2])):
+        tmp = []
+        tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
+        tmp += [jsobj[2][i][1], jsobj[2][i][3]]
+
+        # image
+        image = []
+        if jsobj[2][i][14]:
+            for j in range(len(jsobj[2][i][14])):
+                image += [jsobj[2][i][14][j][6][0]]
+        tmp += [image]
+
+        #rating
+        tmp += [jsobj[2][i][4]]
+
+        # store reply
+        if jsobj[2][i][9]:
+            tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
+        else:
+            tmp += ['', '']
+
+        result.append(list(map(lambda x, y: {x:y}, columns_name, tmp)))
+
+    return result 
+
+
+def process_web_request_reviews(driver, output):
+    time.sleep(3.8)
+
+    print("reviews&**********************")
+    for request in driver.requests:
+        if request.response:
+            # print(request.url)
+            if 'listentitiesreviews?' in request.url :
+                print('parsing js:')
+                print(request.url)
+                resp = brotli.decompress(request.response.body)
+                jstext = resp.decode('utf-8')
+                result = reviews_parsing_js(jstext)
+                output['reviews'] = str(result)
+                time.sleep(1)
+
+                return output
+
+    # No reviews XHR was captured; return output unchanged rather than None.
+    return output
+
+
+def photos_parsing_js(resp, c):
+    def image_url_change_size(url):
+        # Swap the size token in the photo URL so Google serves a 600px version.
+        url_split = url.split('=')
+        new_url = url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2::])
+        return new_url
+
+    jsobj = json.loads(resp[5::])
+    # write_to_file(jsobj,'tmp/debug_{}.pickle'.format(c))
+
+    menu = []
+    all_photos = []  # renamed from "all" to avoid shadowing the built-in
+
+    # jsobj[10] identifies which photo tab produced this payload: 0 is the
+    # "全部" (all photos) tab, anything else is treated as the menu tab.
+    if jsobj[10] == 0:
+        for img in jsobj[0]:
+            all_photos += [image_url_change_size(img[6][0])]
+    else:
+        for img in jsobj[0]:
+            menu += [image_url_change_size(img[6][0])]
+
+    return menu, all_photos
+    
+
+
+def process_web_request_photo(driver, output):
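+    # Click the 菜單 (menu) and 全部 (all photos) tabs, if present, so the
+    # photo? XHRs for both galleries are captured by the proxy.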
+    try:
+        driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
+        photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
+        tab_dict = {}
+        for tab_index in [0, 1, 2]:
+            selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
+            if len(selector) != 0:
+                photo_name = selector[0].text
+                if photo_name == '菜單':
+                    tab_dict[photo_name] = tab_index
+                elif photo_name == '全部':
+                    tab_dict[photo_name] = tab_index
+    except Exception:
+        tab_dict = {}
+    print(tab_dict)
+
+    for tab_ in tab_dict:
+        tab_index = tab_dict[tab_]
+        print(tab_index)
+        wait = WebDriverWait(driver, 60)
+        wait.until(
+            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
+        )
+        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
+        ActionChains(driver).move_to_element(element).click(element).perform()
+        time.sleep(2)
+
+    print("photo&**********************")
+    menu_list = []
+    all_list = []
+    c = 0  # counter for the optional per-response debug pickle filename
+    for request in driver.requests:
+        if request.response:
+            # print(request.url)
+            if 'photo?' in request.url :
+                print('parsing js:')
+                print(request.url)
+                resp = brotli.decompress(request.response.body)
+                jstext = resp.decode('utf-8')
+                menu, all_photos = photos_parsing_js(jstext, c)
+                menu_list += menu
+                all_list += all_photos
+                c += 1
+
+    output['shop_photo'] = str(all_list)
+    output['menu_photo'] = str(menu_list)
+
+    return output
+    
+
+def main():
+    global chrome_window
+    global store_list_table
+    global reviews_table
+    global proxyport
+    global iddict
+
+    localip=socket.gethostbyname(socket.gethostname())
+
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    store_list_table = db['swire_store_list']
+    shop_table = db['shop_list4']
+
+    iddict=build_cache(db)
+    
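+    # Optional CLI override: python run4.py <webdriver_port> <proxy_port>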
+    port=4444
+    if len(sys.argv) == 3 :
+        port=int(sys.argv[1])
+        proxyport=int(sys.argv[2])
+    if not chrome_window:
+        print('restart docker pw{}'.format(port))
+#        os.system('sudo docker container restart p'+str(port))
+        os.system('sudo docker container restart pw'+str(port))
+
+        time.sleep(10)
+
+    print('driver start...')
+    driver = browser_start(port)
+
+    job = get_next_job(db)
+
+    for row, group in job.iterrows():
+        try:
+            item_url = group['item_url']
+            name = group['name']
+            num = group['num']
+            keyword = group['keyword']
+
+            if name:
+                db_name = name
+            else:
+                db_name = num
+
+            print(name, num, keyword, db_name)
+            print(item_url)
+
+            #shop_info
+            print('parsing shop info....')
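+            # Open the place page and click the search button to force the
+            # place? XHR, retrying up to five times until parsing succeeds.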
+            for i in range(5):
+                print('shop info try...{}'.format(i))
+                driver.get(item_url)
+                time.sleep(2)
+
+                element = driver.find_element_by_id('searchbox-searchbutton')
+                driver.implicitly_wait(10)
+                ActionChains(driver).move_to_element(element).click(element).perform()
+                time.sleep(5)
+                driver.back()
+
+                if driver.current_url == item_url: continue
+                print(driver.current_url)
+                output = process_web_request_start(driver, db_name)
+                if output != 0: break
+
+
+            # reviews
+            print('parsing reviews....')
+            if output['user_ratings_total'] == '':
+                output['reviews'] = ''
+            else:
+                for i in range(3):
+                    print('reviews try...{}'.format(i))
+                    try:
+                        wait = WebDriverWait(driver, 30)
+                        more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
+                        wait.until(
+                            EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
+                        )
+                        element = driver.find_element_by_css_selector(more_reviews_css)
+                        driver.implicitly_wait(10)
+                        ActionChains(driver).move_to_element(element).click(element).perform()
+                        time.sleep(0.5)
+
+                        output = process_web_request_reviews(driver, output)
+                        break
+                    except Exception:
+                        driver.get(item_url)
+                        time.sleep(0.5)
+
+            # photo
+            print('parsing photo....')
+            if output['header_image'] != '':
+                for i in range(3):
+                    print('photo try...{}'.format(i))
+                    driver.get(item_url)
+                    time.sleep(0.5)
+                    print(driver.current_url)
+                    try:
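+                        # "{}的相片" means "photos of {}"; clicking the header
+                        # image div opens the photo gallery.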
+                        wait = WebDriverWait(driver, 30)
+                        wait.until(
+                            EC.element_to_be_clickable((By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name'])))
+                        )
+                        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
+                        ActionChains(driver).move_to_element(element).click(element).perform()
+
+                        output = process_web_request_photo(driver, output)
+                        break
+                    except Exception:
+                        pass
+
+            else:
+                output['shop_photo'] = '[]'
+                output['menu_photo'] = '[]'
+
+            print(output)
+            query_name = output['adress_name'].replace('(','').replace(')', '').replace(' ','')
+            output['item_url'] = item_url
+            output['keyword'] = keyword
+            output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
+            output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
+
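+            # Note: dataset's insert() does not deduplicate; switching to
+            # upsert(output, ['item_url']) would key rows on item_url.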
+            shop_table.insert(output,['item_url'])
+        except Exception:
+            traceback.print_exc()
+
+if __name__ == '__main__':
+    main()