noodles 2 anos atrás
pai
commit
1b708896c7
5 arquivos alterados com 582 adições e 0 exclusões
  1. 393 0
      shop_item_crawler.py
  2. 0 0
      utility/__init__.py
  3. 13 0
      utility/connect.py
  4. 99 0
      utility/database_access.py
  5. 77 0
      utility/parseutils.py

+ 393 - 0
shop_item_crawler.py

@@ -0,0 +1,393 @@
+# -*- coding: utf-8 -*-
+from seleniumwire import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+import selenium
+import traceback
+from bs4 import BeautifulSoup
+
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *
+
+from datetime import datetime
+import pandas as pd
+import dataset
+import requests
+import time
+import json
+import re
+import sys, os
+import socket
+import brotli
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import urllib.parse
+# When False, brower_start() connects to a remote Selenium hub through the
+# selenium-wire proxy; when True it launches a local Chrome instead.
+chrome_window=False
+#chrome_window=True
+
+# Search keyword; main() overwrites it with sys.argv[1] when arguments are given.
+globalkw=None
+# Port of the selenium-wire proxy; main() overwrites it with sys.argv[3].
+proxyport=8787
+
+
+def build_cache(db):
+    """Return a dict whose keys are every ``place_id`` in the store list.
+
+    Each key maps to ``1``; save_js_to_db() only checks for key presence.
+    """
+    sql = 'SELECT place_id FROM {}.{};'.format(MYSQL_CONFIG['MYSQL_DB'], TABLE_STORE_LIST)
+    return {row['place_id']: 1 for row in db.query(sql)}
+
+
+def brower_start(port):
+    """Create and return the selenium-wire driver used by the crawler.
+
+    With ``chrome_window`` set, a local Chrome is started. Otherwise a remote
+    session is opened on the Selenium hub at ``127.0.0.1:<port>``, with Chrome
+    told to send its traffic through the selenium-wire proxy listening on
+    ``proxyport``, and the window is resized to 1400x1000.
+    """
+    print(proxyport)
+    options = webdriver.ChromeOptions()
+    if chrome_window:
+        return webdriver.Chrome(
+            desired_capabilities=options.to_capabilities()
+        )
+
+    remote_options = webdriver.ChromeOptions()
+    for flag in (
+        '--proxy-server=host.docker.internal:'+str(proxyport),
+        '--ignore-certificate-errors',
+        "--no-sandbox",
+        "--disable-dev-shm-usage",
+    ):
+        remote_options.add_argument(flag)
+
+    browser = webdriver.Remote(
+        command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
+        desired_capabilities=remote_options.to_capabilities(),
+        seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
+    )
+    browser.set_window_size(1400,1000)
+    return browser
+
+
+def keyin_keyword(driver, keyword):
+    """Type ``keyword`` into the element with id "searchbox" and press Return.
+
+    Args:
+        driver: Selenium WebDriver showing the page that holds the search box.
+        keyword: Text to submit.
+    """
+    # Configure the implicit wait first so it also covers the lookup below;
+    # previously it was set only after the element had already been fetched.
+    driver.implicitly_wait(30)
+    # find_element(By.ID, ...) replaces the removed find_element_by_id helper.
+    search_box = driver.find_element(By.ID, "searchbox")
+    ActionChains(driver).move_to_element(search_box).send_keys(keyword).send_keys(Keys.RETURN).perform()
+    # Pause so the page can react to the submitted search.
+    time.sleep(3)
+
+
+def scan_job(db, kw):
+    """Pick a random area whose most recent crawl log shows a positive gain.
+
+    For every ``num`` the conv_log row with the highest ``id`` is inspected;
+    one area whose ``next - prev`` is greater than zero is chosen at random
+    and its coordinates are read from the lat/lon table.
+
+    Args:
+        db: ``dataset`` database handle (only ``query`` is used).
+        kw: Value stored under ``'kw'`` in the returned job.
+
+    Returns:
+        dict with ``kw``, ``num``, ``lat``, ``lon`` and ``loc``, or ``None``
+        when no area qualifies or the chosen area has no coordinates row.
+    """
+    # TABLE_CONV_LOG is a module-level constant, not a key of MYSQL_CONFIG;
+    # looking it up in MYSQL_CONFIG raised KeyError on every call.
+    table_name = '{}.{}'.format(MYSQL_CONFIG['MYSQL_DB'], TABLE_CONV_LOG)
+    candidate_sql = (
+        'select t1.num,next-prev as diff from {0} t1, '
+        '(SELECT num,max(id) mid  FROM {0} group by num  ) t2 '
+        'where t1.id=t2.mid having diff>0 order by rand()'
+    ).format(table_name)
+
+    candidate = next(iter(db.query(candidate_sql)), None)
+    if candidate is None:
+        print('scan_job: no area with a positive diff found')
+        return None
+
+    result = {'kw' : kw, 'num': candidate['num']}
+    # Bind the value instead of concatenating it into the statement.
+    coord_sql = 'select lat,lon,loc from {} where num = :num'.format(TABLE_LAT_LON)
+    for c in db.query(coord_sql, num=str(result['num'])):
+        result['lat'] = c['lat']
+        result['lon'] = c['lon']
+        result['loc'] = c['loc']
+        return result
+    return None
+
+
+def get_next_job(db, repeat=False, repkw=None, repnum=None):
+    """Choose the keyword and area for the next crawl.
+
+    Steps, in order:
+      1. Take one random row with ``expand = 0`` from the area-code table for
+         ``kw``/``num``; a non-None ``repkw`` then overrides ``kw``.
+      2. Look up ``lat``/``lon``/``loc`` for that ``num``.
+      3. With ``repeat`` and ``repkw != 'REP'``, force ``kw``/``num`` to
+         ``repkw``/``repnum``.
+      4. If ``repkw`` contains ``'REP'``, replay a store-list entry: the row
+         for ``repnum`` (a random ``num`` when ``repnum`` is ``None`` or
+         ``'REP'``) supplies keyword and coordinates and is returned at once.
+      5. With ``repeat``, overwrite ``kw``/``lat``/``lon`` from a random
+         store-list row.
+
+    Args:
+        db: ``dataset`` database handle (only ``query`` is used).
+        repeat: Enables steps 3 and 5.
+        repkw: Keyword override, or a value containing ``'REP'`` for step 4.
+        repnum: Area number used by steps 3 and 4.
+
+    Returns:
+        dict that may contain ``kw``, ``num``, ``lat``, ``lon`` and ``loc``;
+        a key is absent when the lookup that would set it found nothing.
+    """
+    def first_row(cursor):
+        # First row of a result iterator, or None when it is empty.
+        return next(iter(cursor), None)
+
+    result={}
+    row = first_row(db.query('select kw, num from {} where expand = 0 order by rand()'.format(TABLE_AREACODES)))
+    if row is not None:
+        if repkw is None:
+            repkw = row['kw']
+        result['kw'] = row['kw']
+        result['num'] = row['num']
+    if repkw is not None:
+        result['kw'] = repkw
+
+    if result.get('num') is not None:
+        row = first_row(db.query(
+            'select lat,lon,loc from {} where num = :num'.format(TABLE_LAT_LON),
+            num=str(result['num'])))
+        if row is not None:
+            result['lat']=row['lat']
+            result['lon']=row['lon']
+            result['loc']=row['loc']
+
+    if repeat and repkw!= 'REP':
+        result['kw']=repkw
+        result['num']=repnum
+
+    # repkw is still None when it was not supplied and no unexpanded area
+    # exists; "'REP' in None" used to raise TypeError here.
+    if repkw is not None and 'REP' in repkw:
+        if repnum is None or repnum=='REP':
+            repnum=None
+            row = first_row(db.query('select num from {} order by rand() limit 1'.format(TABLE_STORE_LIST)))
+            if row is not None:
+                repnum=row['num']
+
+        row = first_row(db.query(
+            'select  lat_txt,lon_txt,keyword,num from {} where num = :num limit 1'.format(TABLE_STORE_LIST),
+            num=str(repnum)))
+        if row is not None:
+            result['kw']=row['keyword']
+            result['num']=row['num']
+            result['lat']=row['lat_txt']
+            result['lon']=row['lon_txt']
+            result['loc']=''
+            return result
+
+    if repeat:
+        row = first_row(db.query('select  lat_txt,lon_txt,keyword from {} order by rand() limit 1'.format(TABLE_STORE_LIST)))
+        if row is not None:
+            result['kw']=row['keyword']
+            result['lat']=row['lat_txt']
+            result['lon']=row['lon_txt']
+
+    return result
+
+
+def write_to_file(jsobj,fname):
+    """Write ``str(jsobj)`` to ``fname`` as UTF-8, replacing any existing file.
+
+    Args:
+        jsobj: Any object; its ``str()`` form is written.
+        fname: Destination path.
+    """
+    # The with-block closes the file even if the write fails. newline=''
+    # disables newline translation, matching what codecs.open did.
+    with open(fname, 'w', encoding='utf-8', newline='') as fw:
+        fw.write(str(jsobj))
+
+
+def parsing_js(orig):
+    """Extract store records from the text of a Maps search response.
+
+    Escaped quotes are unescaped, the span from the first ``[["`` up to the
+    first ``]]"`` is parsed as JSON, and every element after the first in
+    ``jsobj[0][1]`` is flattened into a dict using fixed positions inside
+    the element's index-14 list.
+
+    Args:
+        orig: Decoded response text.
+
+    Returns:
+        list of dicts with the keys ``name``, ``fid``, ``addr``,
+        ``addr_elmts``, ``place_id``, ``category``, ``rating``,
+        ``reviews_cnt``, ``lat``, ``lat_txt``, ``lon``, ``lon_txt``,
+        ``topic``, ``photo``, ``num_photos``, ``loc_x``, ``loc_y``,
+        ``biz_id``, ``tel`` and ``crawler_date``. Optional fields are
+        ``None`` when the source entry lacks them. An empty list is returned
+        when the start or end marker is missing.
+
+    Raises:
+        json.JSONDecodeError: if the located span is not valid JSON.
+    """
+    # The replacement runs twice on purpose: a doubly escaped quote becomes a
+    # singly escaped one in the first pass and a plain quote in the second.
+    content = ''.join(
+        line.replace('\\"','"').replace('\\"','"') for line in orig.split('\n')
+    )
+
+    begin_match = re.search(r'\[\["',content)
+    print(begin_match)
+    end_match = re.search(r'\]\]"',content)
+    print(end_match)
+    if begin_match is None or end_match is None:
+        # Without both markers there is nothing to parse; previously this
+        # crashed with AttributeError on the None match.
+        return []
+
+    # end() points just past the quote that follows "]]"; drop that quote.
+    jscontent = content[begin_match.start():end_match.end()-1]
+    jsobj = json.loads(jscontent)
+
+    resultobj = []
+    for x in jsobj[0][1][1:]:
+        info = x[14]
+        print(info[11])
+        print(info[9])
+        # Reset every optional field for each entry. num_photos used to be
+        # left unset, which dropped the record (NameError) or reused the
+        # value of the previous entry.
+        reviews_cnt=None
+        photo=None
+        num_photos=None
+        rating=None
+        biz_id=None
+        loc_x=None
+        loc_y=None
+        addr_elmts=None
+        tel=None
+        try:
+            rating=info[4][7]
+            reviews_cnt=info[4][8]
+        except Exception:
+            traceback.print_exc()
+
+        try:
+            photo=info[37][0][0][0]
+            num_photos=info[37][0][0][6][1]
+        except Exception:
+            traceback.print_exc()
+
+        try:
+            loc_x=info[37][0][0][29][0]
+            loc_y=info[37][0][0][29][1]
+        except Exception:
+            traceback.print_exc()
+
+        try:
+            biz_id=info[57][2]
+            tel=info[178][0][3]
+        except Exception:
+            traceback.print_exc()
+
+        try:
+            addr_elmts=str(info[82])
+        except Exception:
+            traceback.print_exc()
+
+        category=str(info[13])
+        topic=str(info[89])
+        print(info[13])
+
+        print(info[10])
+        print(info[2])
+        print(info[78])
+        try:
+            resultobj.append({
+                'name':info[11],
+                'fid':info[10],
+                'addr':info[2][0],
+                'addr_elmts':addr_elmts,
+                'place_id':info[78],
+                'category':category,
+                'rating':rating,
+                'reviews_cnt':reviews_cnt,
+                'lat':info[9][2],
+                'lat_txt':str(info[9][2]),
+                'lon':info[9][3],
+                'lon_txt':str(info[9][3]),
+                'topic':topic,
+                'photo':photo,
+                'num_photos':num_photos,
+                'loc_x':loc_x,
+                'loc_y':loc_y,
+                'biz_id':biz_id,
+                'tel':tel,
+                'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M"),
+            })
+        except Exception:
+            traceback.print_exc()
+    return resultobj
+
+
+def save_js_to_db(jsobj,num,keyword):
+    """Insert parsed store records whose ``place_id`` is not cached yet.
+
+    Uses the module globals ``iddict`` (cache of known place ids) and
+    ``store_list_table`` (target table), both assigned in main().
+
+    Args:
+        jsobj: Iterable of record dicts as produced by parsing_js().
+        num: Area number stored on each inserted record.
+        keyword: Search keyword stored on each inserted record.
+    """
+    for r in jsobj:
+        place_id = r['place_id']
+        if iddict.get(place_id) is not None:
+            continue
+        r['num']=num
+        r['keyword']=keyword
+
+        try:
+            store_list_table.insert(r)
+        except Exception:
+            # Best effort: report the failure and continue with the next record.
+            traceback.print_exc()
+        else:
+            # Remember the id so a repeat later in this run is skipped; the
+            # cache was previously never updated after start-up.
+            if place_id is not None:
+                iddict[place_id]=1
+
+
+def process_web_request(db, driver, area_num, keyword):
+    """Parse captured search responses, store the results and log the counts.
+
+    For every captured request whose URL contains ``search?`` and that has a
+    response, the body is brotli-decompressed, parsed with parsing_js() and
+    saved with save_js_to_db(). Afterwards the number of stored rows for
+    ``area_num`` is written to the conv_log table together with the global
+    ``prev_cnt``. The captured requests are cleared at the end.
+    """
+    time.sleep(0.8)
+    time.sleep(3)
+    print("ppppppppp&**********************")
+    count_sql = 'select count(*) as cnt from {} where num="{}" '.format(TABLE_STORE_LIST, str(area_num))
+    for request in driver.requests:
+        if 'search?' not in request.url:
+            continue
+        print('searching.....')
+        if not request.response:
+            continue
+
+        print('parsing js:')
+        print(request.url)
+        jstext = brotli.decompress(request.response.body).decode('utf-8')
+        stores = parsing_js(jstext)
+        print("before",datetime.now())
+        print("num: "+str(area_num))
+        save_js_to_db(stores, area_num, keyword)
+        print("after",datetime.now())
+
+        # Count the rows now stored for this area.
+        aft_cnt = 0
+        for row in db.query(count_sql):
+            aft_cnt = row['cnt']
+            break
+        db[TABLE_CONV_LOG].insert({'num':area_num,'prev':prev_cnt,'next':aft_cnt,'dt':datetime.now()})
+    del driver.requests
+
+
+def check_area_code(db, kw):
+    """Add one area-code row per lat/lon area for a keyword not seen before.
+
+    When ``kw`` is absent from the area-code table, every ``num`` of the
+    lat/lon table is inserted with that keyword and ``expand = 0``.
+
+    Args:
+        db: ``dataset`` database handle (only ``query`` is used).
+        kw: Search keyword to register.
+    """
+    table_name = '{}.{}'.format(MYSQL_CONFIG['MYSQL_DB'], TABLE_AREACODES)
+    known = {row['kw'] for row in db.query('select distinct(kw) from {}'.format(table_name))}
+    if kw in known:
+        return
+
+    try:
+        # The keyword is passed as a bound parameter so quotes in it cannot
+        # break or alter the statement.
+        sql = 'insert into {} (select num, :kw as kw,0 as expand from {}) '.format(table_name, TABLE_LAT_LON)
+        db.query(sql, kw=kw)
+    except Exception:
+        traceback.print_exc()
+
+
+def page_down_(driver, time_):
+    """Click the ``div.TFQHme`` element, go back, then press END ``time_`` times.
+
+    Any error is printed with its traceback and otherwise ignored.
+    """
+    try:
+        target = driver.find_element_by_css_selector('div[class="TFQHme"]')
+        webdriver.ActionChains(driver).move_to_element(target).click().perform()
+        time.sleep(1)
+        driver.back()
+        time.sleep(1)
+
+        step = 0
+        while step < time_:
+            print(step)
+            ActionChains(driver).send_keys(Keys.END).perform()
+            time.sleep(0.5)
+            step += 1
+    except:
+        traceback.print_exc()
+
+
+
+def main():
+    """Run one crawl: pick a keyword/area job, search Maps and store results.
+
+    Command line: ``[keyword [hub_port [proxy_port [extra]]]]``. Each of the
+    first three arguments is optional and falls back to ``globalkw``, 4447
+    and ``proxyport``. When more than three arguments are given the job is
+    taken from scan_job() (first argument contains ``SCAN``) or from
+    get_next_job() in repeat mode.
+    """
+    global store_list_table
+    global globalkw
+    global proxyport
+    global iddict
+    global prev_cnt
+
+    port=4447
+    # Read each argument only if present; previously one or two arguments
+    # raised IndexError because argv[2] and argv[3] were always read.
+    if len(sys.argv)>1:
+        globalkw=sys.argv[1]
+    if len(sys.argv)>2:
+        port=int(sys.argv[2])
+    if len(sys.argv)>3:
+        proxyport=int(sys.argv[3])
+    print(globalkw, port, proxyport)
+
+    db = dataset.connect('mysql://{}:{}@{}/{}?charset=utf8mb4'.format( MYSQL_CONFIG['MYSQL_USER'],
+        MYSQL_CONFIG['MYSQL_PASSWORD'], MYSQL_CONFIG['MYSQL_HOST'], MYSQL_CONFIG['MYSQL_DB']))
+    iddict = build_cache(db)
+    store_list_table = db[TABLE_STORE_LIST]
+    table2 = db[TABLE_PROGRESS_LIST]
+
+    if not chrome_window:
+        print('restart docker pw{}'.format(port))
+        os.system('sudo docker container restart pw'+str(port))
+        # Give the restarted container time to come up.
+        time.sleep(10)
+
+    print('drvier start...')
+    driver = brower_start(port)
+    try:
+        if len(sys.argv) > 4 :
+            repkw = sys.argv[1]
+            # NOTE(review): argv[2] is also parsed as the hub port above;
+            # confirm that reusing it as the area number is intended.
+            repnum = sys.argv[2]
+            if 'SCAN' in repkw:
+                job = scan_job(db, repnum)
+            else:
+                job = get_next_job(db, repeat=True, repkw=repkw, repnum=repnum)
+        else:
+            job = get_next_job(db, repkw=globalkw)
+        print(job)
+
+        # Both job helpers can come back without a usable job; stop cleanly
+        # instead of failing with KeyError/TypeError below.
+        if not job or any(key not in job for key in ('kw', 'lat', 'lon', 'num')):
+            print('no usable job found')
+            return
+
+        keyword  = job['kw']
+        latitude = job['lat']   # latitude
+        longitude = job['lon']  # longitude
+        area_num = job['num']
+
+        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
+        print(url)
+
+        # Row count for this area before the crawl; process_web_request()
+        # logs it next to the count afterwards.
+        prev_cnt=0
+        cursor = db.query(
+            'select count(*) as cnt from {} where num = :num '.format(TABLE_STORE_LIST),
+            num=str(area_num))
+        for c in cursor:
+            prev_cnt = c['cnt']
+            break
+
+        driver.get(url)
+        time.sleep(2)
+        keyin_keyword(driver, keyword)
+        page_down_(driver, 10)
+        process_web_request(db, driver, area_num, keyword)
+
+        table2.insert({'kw':keyword,'num':job['num']},['kw'])
+        # Bound parameters keep quotes in the keyword from breaking the SQL.
+        db.query(
+            'update {} set expand = 1 where num = :num and kw = :kw '.format(TABLE_AREACODES),
+            num=str(job['num']), kw=keyword)
+    finally:
+        # Always end the browser session, including on errors.
+        driver.quit()
+
+if __name__ == '__main__':
+    main()

+ 0 - 0
utility/__init__.py


+ 13 - 0
utility/connect.py

@@ -0,0 +1,13 @@
+import os
+
+# MySQL connection settings. Each value can be overridden with an environment
+# variable of the same name; the literals are the previous hard-coded values,
+# kept as fallbacks so existing deployments keep working.
+# NOTE(review): the fallback credentials are committed in plain text. Rotate
+# them and remove the defaults once every deployment sets the variables.
+MYSQL_CONFIG = {
+    "MYSQL_HOST": os.environ.get("MYSQL_HOST", "db.ptt.cx"),
+    "MYSQL_USER": os.environ.get("MYSQL_USER", "choozmo"),
+    "MYSQL_PASSWORD": os.environ.get("MYSQL_PASSWORD", "pAssw0rd"),
+    "MYSQL_PORT": int(os.environ.get("MYSQL_PORT", 3306)),
+    "MYSQL_DB": os.environ.get("MYSQL_DB", "google_poi_v1"),
+}
+
+# Table names used by the crawler.
+TABLE_STORE_LIST = 'store_list'
+TABLE_PROGRESS_LIST = 'progress_list'
+TABLE_LAT_LON = 'lat_lon_loc'
+TABLE_CONV_LOG = 'conv_log'
+
+TABLE_AREACODES = 'areacodes'

+ 99 - 0
utility/database_access.py

@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+import dataset
+import pandas as pd
+import pymysql
+
+
+def check_unique(table, value):
+    """Return 1 when no row of ``table`` has ``url == value``, otherwise 0."""
+    for _ in table.find(url=value):
+        # At least one row already carries this url.
+        return 0
+    return 1
+    
+    
+def DBconnect():
+    """Open the SQLite database ``gs.db``, print its tables and return it."""
+    db = dataset.connect('sqlite:///gs.db')
+    tables = db.tables
+    print('Have {} tables: {}'.format(len(tables), str(tables)))
+    return db
+
+
+def Tabelconnect(db, table_name):
+    """Return ``db[table_name]`` after printing its column count and names."""
+    table = db[table_name]
+    columns = table.columns
+    print('Have {} columns: {}'.format(len(columns), str(columns)))
+    return table
+
+
+def DataToCsv(table, output_file_name):
+    """Write every row of the ``gs.db`` table named ``table`` to a CSV file.
+
+    Args:
+        table: Name of the table inside ``gs.db``.
+        output_file_name: Path of the CSV file to write (no index column).
+    """
+    source = dataset.connect('sqlite:///gs.db')[table]
+    rows = [dict(record) for record in source]
+    data = pd.DataFrame(rows)
+    print('output data size: {}'.format(len(data)))
+    data.to_csv(output_file_name, index=False)
+    
+    
+def mysql_connect(mysql_config, DB_name):
+    """Open and return a pymysql connection to ``DB_name``.
+
+    Host, port, user and password are read from ``mysql_config`` under the
+    keys ``MYSQL_HOST``, ``MYSQL_PORT``, ``MYSQL_USER`` and ``MYSQL_PASSWORD``.
+    """
+    connect_args = {
+        'host': mysql_config['MYSQL_HOST'],
+        'port': mysql_config['MYSQL_PORT'],
+        'user': mysql_config['MYSQL_USER'],
+        'password': mysql_config['MYSQL_PASSWORD'],
+        'db': DB_name,
+    }
+    return pymysql.connect(**connect_args)
+
+
+def create_table(db, table_name, schema):
+    """Drop ``table_name`` if it exists, then execute ``schema`` to recreate it.
+
+    Existing data in the table is destroyed.
+
+    Args:
+        db: DB-API connection providing ``cursor()``.
+        table_name: Table to drop.
+        schema: CREATE TABLE statement to execute afterwards.
+    """
+    cursor = db.cursor()
+    try:
+        cursor.execute("DROP TABLE IF EXISTS {}".format(table_name))
+        cursor.execute(schema)
+        print("Created table {} Successfull.".format(table_name))
+    finally:
+        # Close the cursor even when one of the statements fails.
+        cursor.close()
+    
+    
+def mysql_data_version(db):
+    """Print the version reported by ``SELECT VERSION()``.
+
+    Args:
+        db: DB-API connection providing ``cursor()``.
+    """
+    cursor = db.cursor()
+    try:
+        cursor.execute("SELECT VERSION()")
+        # fetchone() returns a single row.
+        data = cursor.fetchone()
+    finally:
+        # The cursor was previously never closed.
+        cursor.close()
+    print ("Database version : %s " % data)
+    
+    
+def mysql_insert_data(db, insert_sql):
+    """Execute ``insert_sql`` and commit; on failure print the error and roll back.
+
+    Args:
+        db: DB-API connection providing ``cursor()``, ``commit()`` and
+            ``rollback()``.
+        insert_sql: Complete SQL statement to execute.
+    """
+    cursor = db.cursor()
+    try:
+        cursor.execute(insert_sql)
+        db.commit()
+    except Exception as e:
+        print(e)
+        # Undo the partial work of the failed statement.
+        db.rollback()
+    finally:
+        # Runs even if the rollback itself raises, so the cursor never leaks.
+        cursor.close()
+    
+    
+def mysql_select_data(db, query_sql):
+    """Execute ``query_sql`` and return all fetched rows.
+
+    Args:
+        db: DB-API connection providing ``cursor()``.
+        query_sql: Complete SELECT statement to execute.
+
+    Returns:
+        Whatever ``cursor.fetchall()`` returns, or an empty tuple when the
+        query fails (the error is printed, not raised).
+    """
+    # Default for the failure path; without it the final return raised
+    # UnboundLocalError after a failed query.
+    results = ()
+    cursor = db.cursor()
+    try:
+        cursor.execute(query_sql)
+        results = cursor.fetchall()
+    except Exception:
+        import traceback
+        traceback.print_exc()
+
+        print ("Error: unable to fetch data")
+    finally:
+        cursor.close()
+
+    return results

+ 77 - 0
utility/parseutils.py

@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+# Database and table names for the shop list. Nothing in the visible code
+# reads them; presumably other crawler scripts do (TODO confirm).
+DB_NAME = 'google_poi'
+SHOP_LIST_TABLE = 'shop_list3'
+# Ordered field names of one shop-list record.
+SHOP_LIST_TABLE_COL = ['name', 'fid', 'city', 'area', 
+                      'rating', 'user_ratings_total', 'category', 'price_level', 
+                      'addr', 'tel', 'services', 'products', 'choices', 
+                      'facilities', 'groups', 'plans', 'payments', 'safeties', 
+                      'specials', 'barrierlevels', 'items' ,
+                      'open_now', 'periods', 'weekday_text','reviews',
+                      'shop_photo','menu_photo',
+                      'google_url', 'item_url', 'keyword', 'crawler_date']
+                      
+# SHOP_LIST_TABLE_COL = ['unique_id', 'name', 'lon', 'lat', 'city', 'area', 
+#                       'rating', 'user_ratings_total', 'category', 'price_level', 
+#                       'addr', 'tel', 'services', 'products', 'choices', 
+#                       'facilities', 'groups', 'plans', 'payments', 'safeties', 
+#                       'specials', 'barrierlevels', 'items' ,
+#                       'open_now', 'periods', 'weekday_text','reviews',
+#                       'shop_photo','menu_photo',
+#                       'google_url', 'item_url', 'keyword', 'crawler_date']
+
+# Per-field lookup spec: [tag name, attribute filter, optional attribute to read].
+# Presumably consumed as HTML-parser find arguments by code outside this
+# file (TODO confirm against the caller).
+element_list = {
+    'category': ['button', {'jsaction':'pane.rating.category'}],
+    'rating': ['ol', {}, 'aria-label'],
+    'user_ratings_total': ['span', {'jsaction':'pane.rating.moreReviews'}],
+    'price_level':['span', {'jsan':'0.aria-label'}]
+}
+
+
+# Maps a Chinese section heading to [plural name, singular name]. The plural
+# names match entries of SHOP_LIST_TABLE_COL.
+intro_list = {
+    '服務選項': ['services','service'],
+    '產品/服務': ['products','product'],
+    '用餐選擇': ['choices','choice'],
+    '設施': ['facilities','facility'],
+    '客層族群':['groups','group'],
+    '規劃':['plans','plan'],
+    '付款方式':['payments','payment'],
+    '健康與安全':['safeties','safety'],
+    '特色':['specials','special'],
+    '無障礙程度':['barrierlevels','barrierlevel'],
+    '詳細資料':['items','item']
+}
+
+# Maps Chinese weekday names to an index, Sunday = 0 through Saturday = 6.
+week_list = {
+    '星期日': 0, 
+    '星期一': 1, 
+    '星期二': 2, 
+    '星期三': 3,
+    '星期四': 4, 
+    '星期五': 5, 
+    '星期六': 6,  
+}
+
+
+def blank_check(value):
+    """Return ``value`` without leading and trailing space characters.
+
+    Only the space character is removed; tabs and newlines are kept.
+    """
+    # str.strip(' ') trims both ends in one pass; the old loops re-sliced
+    # the string once per removed character.
+    return value.strip(' ')
+
+
+def value_check(key, value):
+    """Trim ``value`` and convert it according to ``key``.
+
+    * ``rating``: the suffix ' 星級' is removed and the rest parsed as float.
+    * ``user_ratings_total``: the suffix ' 則評論' and commas are removed and
+      the rest parsed as int.
+    * ``price_level``: the number of ``$`` characters, or ``''`` when none.
+    * any other key: the trimmed string.
+    """
+    value = blank_check(value)
+    if key == 'rating':
+        return float(value.replace(' 星級',''))
+    if key == 'user_ratings_total':
+        return int(value.replace(' 則評論','').replace(',',''))
+    if key == 'price_level':
+        dollars = sum(1 for ch in value if ch == '$')
+        return '' if dollars == 0 else int(dollars)
+    return value