Jared 3 년 전
부모
커밋
587ba5f94a
2개의 변경된 파일, 443개의 추가 그리고 1개의 삭제
  1. 441 0
      swire_docker_itemlist.py
  2. 2 1
      swire_shop_item_list.py

+ 441 - 0
swire_docker_itemlist.py

@@ -0,0 +1,441 @@
+# -*- coding: utf-8 -*-
+#from selenium import webdriver
+from seleniumwire import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+import selenium
+import traceback
+from bs4 import BeautifulSoup
+
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *
+
+from datetime import datetime
+import pandas as pd
+import dataset
+import time
+import json
+import re
+import sys, os
+import socket
+import brotli
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import urllib.parse
+# Browser mode switch read by brower_start(): when True a local headless
+# Chrome is launched; when False a remote WebDriver hub is used and main()
+# restarts the matching docker container first.
+#chrome_window=False
+chrome_window=True
+
+# Optional keyword override; main() fills it from sys.argv[1].
+globalkw=None
+# Port handed to selenium-wire (and to Chrome's --proxy-server flag) in remote
+# mode; main() may override it from sys.argv[3].
+proxyport=8787
+
+
+def build_cache(db):
+    """Return a dict keyed by every place_id already in swire_store_list.
+
+    Every value is 1; the dict is used purely for membership checks
+    (see save_js_to_db, which only tests whether a key is present).
+    """
+    known_rows = db.query('SELECT place_id FROM google_poi.swire_store_list;')
+    return {row['place_id']: 1 for row in known_rows}
+
+def brower_start(port):
+    """Create and return the selenium-wire driver used by the crawler.
+
+    When the module-level ``chrome_window`` flag is true, a local headless
+    Chrome is started. Otherwise a remote driver is attached to the WebDriver
+    hub at 127.0.0.1:<port>, with Chrome pointed at the selenium-wire proxy on
+    ``proxyport`` and the window resized to 1400x1000.
+
+    Args:
+        port: Port of the remote WebDriver hub (only used in remote mode).
+
+    Returns:
+        The started browser/driver object.
+    """
+    global proxyport
+    global chrome_window
+    print(proxyport)
+    options = webdriver.ChromeOptions()
+
+    if not chrome_window:
+        remote_flags = (
+            # Chrome runs in a container, so it reaches the proxy via the docker host alias.
+            '--proxy-server=host.docker.internal:'+str(proxyport),
+            '--ignore-certificate-errors',
+            "--no-sandbox",
+            "--disable-dev-shm-usage",
+        )
+        chrome_options = webdriver.ChromeOptions()
+        for flag in remote_flags:
+            chrome_options.add_argument(flag)
+        hub_url = 'http://127.0.0.1:'+str(port)+'/wd/hub'
+        wire_settings = {'addr':'0.0.0.0','port':proxyport,'auto_config': False}
+        browser = webdriver.Remote(
+            command_executor=hub_url,
+            desired_capabilities=chrome_options.to_capabilities(),
+            seleniumwire_options=wire_settings
+        )
+        browser.set_window_size(1400,1000)
+        return browser
+
+    local_flags = (
+        '--ignore-certificate-errors',
+        "--no-sandbox",
+        "--headless",
+        "--disable-dev-shm-usage",
+    )
+    for flag in local_flags:
+        options.add_argument(flag)
+    browser = webdriver.Chrome(
+        desired_capabilities=options.to_capabilities()
+    )
+    return browser
+
+
+def page_down_(driver, xpath_css, time_):
+    """Scroll the results panel by clicking just right of the last result row.
+
+    The result counter (span.Jl2AFb) is read first; scrolling only happens
+    when it reports more than 5 results.
+
+    Args:
+        driver: Selenium WebDriver currently showing a result list.
+        xpath_css: Unused; kept so existing callers keep working.
+        time_: Number of click attempts, with a 0.5 s pause after each.
+    """
+    counter = driver.find_element(By.CSS_SELECTOR, 'span[class="Jl2AFb"]')
+    # assumes the counter text has the form "<from>-<to> 項結果" -- TODO confirm
+    result_count = counter.text.split('-')[1].replace(' 項結果','')
+    print(result_count)
+    if int(result_count) <= 5:
+        return
+
+    for _ in range(time_):
+        rows = driver.find_elements(By.CSS_SELECTOR, 'div[class="TFQHme"]')
+        if not rows:
+            # No row to anchor the click on; stop instead of raising IndexError.
+            break
+        last_row = rows[-1]
+        # Use the ActionChains imported at file top, as keyin_keyword does.
+        action = ActionChains(driver)
+        action.move_to_element_with_offset(last_row, last_row.size['width'] + 1 , 0)
+        action.click()
+        action.perform()
+        time.sleep(0.5)
+
+
+def get_url_list(driver):
+    """Collect the place links shown in the current result list.
+
+    Scrolls the panel via page_down_ and then parses the page source.
+
+    Args:
+        driver: Selenium WebDriver currently showing a result list.
+
+    Returns:
+        A list of ``[href, aria_label]`` pairs, one per anchor whose href
+        contains ``maps/place`` and that carries an ``aria-label``.
+    """
+    page_down_(driver, '//div[@class="TFQHme"]', 8)
+
+    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
+    url_list = []
+    for anchor in url_soup.find_all('a'):
+        href = anchor.get('href')
+        if href is None or 'maps/place' not in href:
+            continue
+        label = anchor.get('aria-label')
+        if label is None:
+            # Anchors without a label were skipped before as well (the lookup
+            # raised and was swallowed); skip them explicitly instead.
+            continue
+        url_list.append([href, label])
+    return url_list
+
+
+def keyin_keyword(driver, keyword):
+    """Type *keyword* into the element with id "searchbox" and press Enter.
+
+    Also sets the driver's implicit wait to 30 seconds (after the search box
+    has been located) and sleeps 3 seconds once the keys are sent.
+    """
+    search_box = driver.find_element_by_id("searchbox")
+    driver.implicitly_wait(30)
+
+    typing = ActionChains(driver)
+    typing = typing.move_to_element(search_box)
+    typing = typing.send_keys(keyword)
+    typing = typing.send_keys(Keys.RETURN)
+    typing.perform()
+
+    time.sleep(3)
+
+
+def scan_job(db,kw):
+    """Pick a random area whose latest conv_log entry grew, and return a job.
+
+    The first query selects, per ``num``, the newest conv_log row and keeps
+    those with ``next - prev > 0`` in random order; the first row wins. Its
+    coordinates are then read from lat_lon_loc.
+
+    Args:
+        db: dataset database handle.
+        kw: Keyword stored in the job under ``'kw'``.
+
+    Returns:
+        A dict with keys ``kw``, ``num``, ``lat``, ``lon`` and ``loc``, or
+        None when no candidate area or no coordinates were found.
+    """
+    candidates = db.query('select t1.num,next-prev as diff from google_poi.conv_log t1, (SELECT num,max(id) mid  FROM google_poi.conv_log group by num  ) t2 where t1.id=t2.mid having diff>0 order by rand()')
+    chosen = next(iter(candidates), None)
+    if chosen is None:
+        # Previously this fell through to a KeyError on result['num'].
+        return None
+
+    result = {'kw': kw, 'num': chosen['num']}
+    # Bound parameter instead of string concatenation; the value is still
+    # passed as a string, matching the original quoted comparison.
+    locations = db.query(
+        'select lat,lon,loc from lat_lon_loc where num = :num',
+        num=str(result['num']),
+    )
+    location = next(iter(locations), None)
+    if location is None:
+        return None
+
+    result['lat'] = location['lat']
+    result['lon'] = location['lon']
+    result['loc'] = location['loc']
+    return result
+
+def get_next_job(db,repeat=False,repkw=None,repnum=None):
+    """Build the next crawl job as a dict (kw / num / lat / lon / loc).
+
+    Steps, in order:
+      1. Take one random row from ``areacodes`` for ``kw`` and ``num``; if
+         ``repkw`` was not given it defaults to that row's keyword.
+      2. If ``repkw`` is set it replaces ``kw``.
+      3. Look up lat/lon/loc for ``num`` in ``lat_lon_loc``.
+      4. With ``repeat`` true and ``repkw`` other than 'REP', ``kw`` and
+         ``num`` are overwritten by ``repkw`` / ``repnum``.
+      5. If ``repkw`` contains 'REP', a store row from ``swire_store_list``
+         (random when ``repnum`` is 'REP' or None, otherwise the given num)
+         supplies every field and the job is returned immediately.
+      6. With ``repeat`` true, a random store row overrides kw/lat/lon.
+
+    Args:
+        db: dataset database handle.
+        repeat: Enables the overrides in steps 4 and 6.
+        repkw: Keyword override; a value containing 'REP' selects step 5.
+        repnum: Area number override; 'REP' or None means "pick randomly".
+
+    Returns:
+        The job dict. Keys may be missing when the underlying queries return
+        no rows.
+    """
+    global globalkw
+
+    result={}
+#    if globalkw is not None:
+#        cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where kw="'+globalkw+'"')
+#    else:
+#        cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where num < 367 order by rand() limit 1')
+
+#    cursor = db.query('select kw,num  from areacodes where expand=0 order by rand()')
+    cursor = db.query('select kw,num  from areacodes order by rand()')
+
+    # Only the first (random) row is used.
+    for c in cursor:
+#        repkw=c['kw']
+        if repkw is None:
+            repkw=c['kw']
+        result['kw']=c['kw']
+        result['num']=c['num']
+        break
+    if repkw is not None:
+        result['kw']=repkw
+    if result.get('num') is not None:
+        # NOTE(review): SQL is built by string concatenation here and below.
+        cursor = db.query('select lat,lon,loc from lat_lon_loc where num ="'+str(result['num'])+'"')
+        for c in cursor:
+            result['lat']=c['lat']
+            result['lon']=c['lon']
+            result['loc']=c['loc']
+            break
+
+    if repeat and repkw!= 'REP':
+        result['kw']=repkw
+        result['num']=repnum
+
+    # NOTE(review): if areacodes returned no rows and repkw was not passed,
+    # repkw is still None here and the `in` test raises TypeError.
+    if 'REP' in repkw:
+        if repnum=='REP':
+            repnum=None
+#            cursor = db.query('select  num from swire_store_list where num not in (select num from conv_log) order by rand() limit 1')
+
+            cursor = db.query('select  num from swire_store_list  order by rand() limit 1')
+            for c in cursor:
+                repnum=c['num']
+                break
+        if repnum is None:
+            cursor = db.query('select  num from swire_store_list  order by rand() limit 1')
+            for c in cursor:
+                repnum=c['num']
+                break
+
+
+#        cursor = db.query('select  lat_txt,lon_txt,keyword,num from swire_store_list order by rand() limit 1')
+        cursor = db.query('select  lat_txt,lon_txt,keyword,num from swire_store_list where num="'+str(repnum)+'" limit 1')
+
+        # A matching store row fully defines the job; return it right away.
+        for c in cursor:
+            result['kw']=c['keyword']
+            result['num']=c['num']
+            result['lat']=c['lat_txt']
+            result['lon']=c['lon_txt']
+            result['loc']=''
+            return result
+    
+
+
+    if repeat:
+#        cursor = db.query('select  lat_txt,lon_txt from swire_store_list where num ="'+str(result['num'])+'" and keyword="'+result['kw']+'" order by rand() limit 1')
+        cursor = db.query('select  lat_txt,lon_txt,keyword from swire_store_list order by rand() limit 1')
+
+        # Overrides kw/lat/lon with a random store; 'num' keeps its earlier value.
+        for c in cursor:
+            result['kw']=c['keyword']
+            result['lat']=c['lat_txt']
+            result['lon']=c['lon_txt']
+
+    return result
+
+
+def write_to_file(jsobj,fname):
+    """Write ``str(jsobj)`` to *fname* as UTF-8, replacing any existing file.
+
+    Args:
+        jsobj: Any object; its ``str()`` form is what gets written.
+        fname: Destination path.
+    """
+    import codecs
+    # The context manager closes the handle even when the write raises.
+    with codecs.open(fname,'w','utf-8') as fw:
+        fw.write(str(jsobj))
+
+def parsing_js(orig):
+    """Extract store records from the text of a Google Maps search response.
+
+    The text is joined into one line with escaped quotes unescaped, the span
+    from the first ``[["`` to the first ``]]"`` is parsed as JSON, and every
+    entry of ``jsobj[0][1][1:]`` is turned into a flat dict read from fixed
+    positions inside ``entry[14]``.
+
+    Args:
+        orig: Decoded response body.
+
+    Returns:
+        A list of dicts with the keys name, fid, addr, addr_elmts, place_id,
+        category, rating, reviews_cnt, lat, lat_txt, lon, lon_txt, topic,
+        photo, num_photos, loc_x, loc_y, biz_id, tel and crawler_date.
+        Optional fields that are absent are None.
+
+    Raises:
+        ValueError: The ``[["`` / ``]]"`` payload markers were not found.
+    """
+    # Errors raised when a position is missing from the nested lists.
+    lookup_errors = (IndexError, KeyError, TypeError)
+
+    resultobj=[]
+    # Two unescape passes are deliberate: \\" turns into \" after the first
+    # pass and into " after the second.
+    content="".join(
+        line.replace('\\"','"').replace('\\"','"') for line in orig.split('\n')
+    )
+
+    begin=re.search(r'\[\["',content)
+    print(begin)
+    end=re.search(r'\]\]"',content)
+    print(end)
+    if begin is None or end is None:
+        raise ValueError('search-result payload markers not found in response')
+
+    # end() - 1 drops the closing quote that follows the JSON text.
+    jscontent=content[begin.start():end.end()-1]
+    jsobj=json.loads(jscontent)
+    for x in jsobj[0][1][1:]:
+        info=x[14]
+        print(info[11])
+        print(info[9])
+        # Reset every optional field per record. num_photos used to be left
+        # unset, so a record without photos was dropped or inherited the
+        # previous record's count.
+        reviews_cnt=None
+        photo=None
+        num_photos=None
+        rating=None
+        biz_id=None
+        loc_x=None
+        loc_y=None
+        addr_elmts=None
+        tel=None
+        try:
+            rating=info[4][7]
+            reviews_cnt=info[4][8]
+        except lookup_errors:
+            traceback.print_exc()
+
+        try:
+            photo=info[37][0][0][0]
+            num_photos=info[37][0][0][6][1]
+        except lookup_errors:
+            traceback.print_exc()
+
+        try:
+            loc_x=info[37][0][0][29][0]
+            loc_y=info[37][0][0][29][1]
+        except lookup_errors:
+            traceback.print_exc()
+
+        try:
+            biz_id=info[57][2]
+            tel=info[178][0][3]
+        except lookup_errors:
+            traceback.print_exc()
+
+        try:
+            addr_elmts=str(info[82])
+        except lookup_errors:
+            traceback.print_exc()
+
+        category=str(info[13])
+        topic=str(info[89])
+        print(info[13])
+
+        print(info[10])
+        print(info[2])
+        print(info[78])
+        try:
+            resultobj.append({
+                'name':info[11],
+                'fid':info[10],
+                'addr':info[2][0],
+                'addr_elmts':addr_elmts,
+                'place_id':info[78],
+                'category':category,
+                'rating':rating,
+                'reviews_cnt':reviews_cnt,
+                'lat':info[9][2],
+                'lat_txt':str(info[9][2]),
+                'lon':info[9][3],
+                'lon_txt':str(info[9][3]),
+                'topic':topic,
+                'photo':photo,
+                'num_photos':num_photos,
+                'loc_x':loc_x,
+                'loc_y':loc_y,
+                'biz_id':biz_id,
+                'tel':tel,
+                'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M"),
+            })
+        except lookup_errors:
+            # A record missing a mandatory field is skipped, not fatal.
+            traceback.print_exc()
+    return resultobj
+
+def save_js_to_db(jsobj,num,keyword):
+    """Insert parsed store records that are not in the place_id cache yet.
+
+    Each new record is tagged in place with ``num`` and ``keyword`` and
+    inserted into the global ``store_list_table``. Successfully inserted
+    place_ids are added to the global ``iddict`` cache so the same place is
+    not inserted again later in the run.
+
+    Args:
+        jsobj: Iterable of record dicts, each with a ``'place_id'`` key.
+        num: Area number stored on every inserted record.
+        keyword: Search keyword stored on every inserted record.
+    """
+    global store_list_table
+    global iddict
+    for record in jsobj:
+        place_id=record['place_id']
+        if iddict.get(place_id) is not None:
+            continue
+        record['num']=num
+        record['keyword']=keyword
+
+        try:
+            store_list_table.insert(record)
+        except Exception:
+            # Best effort: report the failed row and carry on with the rest.
+            traceback.print_exc()
+        else:
+            # Remember the id; the cache was otherwise only filled at startup.
+            iddict[place_id]=1
+
+def process_web_request(db,driver,area_num,keyword):
+    """Parse captured search responses and store the stores they contain.
+
+    After a short pause, every request captured by the driver whose URL
+    contains ``search?`` and that has a response is brotli-decompressed,
+    parsed with parsing_js and saved with save_js_to_db. A conv_log row
+    recording the store count before (global ``prev_cnt``) and after is then
+    written for the area.
+
+    Args:
+        db: dataset database handle.
+        driver: selenium-wire driver exposing ``requests``.
+        area_num: Area number the results belong to.
+        keyword: Search keyword the results belong to.
+    """
+    global prev_cnt
+    # presumably gives the page time to issue its search requests -- confirm
+    time.sleep(0.8)
+    time.sleep(3)
+    print("ppppppppp&**********************")
+    # NOTE(review): driver.requests is never cleared in this file, so earlier
+    # responses may be parsed again on later calls -- confirm against
+    # selenium-wire's request storage behaviour.
+    for request in driver.requests:
+        if 'search?' not in request.url:
+            continue
+        print('searching.....')
+        if not request.response:
+            continue
+
+        print('parsing js:')
+        # NOTE(review): assumes the body is always brotli-encoded -- confirm.
+        resp = brotli.decompress(request.response.body)
+        jstext=resp.decode('utf-8')
+        resultobj=parsing_js(jstext)
+        print("before",datetime.now())
+        print("num: "+str(area_num))
+        save_js_to_db(resultobj,area_num,keyword)
+        print("after",datetime.now())
+
+        # Bound parameter instead of string concatenation; still compared as
+        # a string, like the original quoted literal.
+        cursor = db.query(
+            'select count(*) as cnt from swire_store_list where num = :num',
+            num=str(area_num),
+        )
+        count_row = next(iter(cursor), None)
+        aft_cnt = count_row['cnt'] if count_row is not None else 0
+        db['conv_log'].insert({'num':area_num,'prev':prev_cnt,'next':aft_cnt,'dt':datetime.now()})
+
+
+
+#    time.sleep(9999)
+
+
+def main():
+    """Run the crawl loop: pick a job, search Google Maps, store the results.
+
+    Command line (all optional): ``argv[1]`` keyword (also ``repkw``),
+    ``argv[2]`` WebDriver port (also ``repnum``), ``argv[3]`` proxy port.
+    With more than four argv entries the repeat/SCAN job selection is used.
+
+    The loop never ends on its own; the process exits once 15 iterations
+    have failed (the counter is cumulative and never reset).
+    """
+    global chrome_window
+    global store_list_table
+    global globalkw
+    global proxyport
+    global iddict
+    global prev_cnt
+
+    port=4444
+    # if len(sys.argv) == 3 :
+    #     port=int(sys.argv[1])
+    #     proxyport=int(sys.argv[2])
+
+    # NOTE(review): a single extra argument passes this check but argv[2] and
+    # argv[3] are then read unconditionally, which raises IndexError.
+    if len(sys.argv)>1:
+        globalkw=sys.argv[1]
+        port=int(sys.argv[2])
+        proxyport=int(sys.argv[3])
+    print(globalkw, port, proxyport)
+    failcnt=0
+    # Only referenced by the commented-out lines below.
+    localip=socket.gethostbyname(socket.gethostname())
+#    if localip=='192.168.1.108':
+#        chrome_window=True
+#        chrome_window=False
+
+    # NOTE(review): database credentials are embedded in the source.
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    # Cache of place_ids already stored, used by save_js_to_db to skip them.
+    iddict=build_cache(db)
+    store_list_table = db['swire_store_list']
+
+#    table2 = db['swire_progress_list']
+    table2 = db['swire_area_progress']
+
+    # Remote mode: restart the container named p<port> before connecting.
+    if not chrome_window:
+        print('restart docker p{}'.format(port))
+#        os.system('sudo docker container restart p'+str(port))
+        os.system('docker container restart p'+str(port))
+
+        time.sleep(10)
+
+    print('drvier start...')
+    driver = brower_start(port)
+    
+    area_num=None
+    while True:
+        try:
+            # NOTE(review): argv[2] is used both as the WebDriver port above
+            # and as repnum here -- confirm this is intended.
+            if len(sys.argv) > 4 :
+                repkw=sys.argv[1]
+                repnum=sys.argv[2]
+                if 'SCAN' in repkw:
+                    job=scan_job(db,repnum)
+                else:
+                    job=get_next_job(db,repeat=True,repkw=repkw,repnum=repnum)
+            else:
+                job=get_next_job(db, repkw=globalkw)
+            print(job)
+            keyword  = job['kw']
+            latitude = job['lat'] # latitude
+            longitude = job['lon'] # longitude
+            area_num=job['num']
+            # Only referenced by the commented-out URL variants below.
+            safe_string = urllib.parse.quote_plus(keyword)
+            url = 'https://www.google.com.tw/maps/@{},{},18z?hl=zh-TW'.format(latitude, longitude)
+
+            # Store count for this area before crawling; process_web_request
+            # writes it to conv_log as 'prev'.
+            prev_cnt=0
+            cursor = db.query('select count(*) as cnt from swire_store_list where num="'+str(area_num)+'" ')
+            for c in cursor:
+                prev_cnt=c['cnt']
+                break
+
+
+#            url = 'https://www.google.com/maps/search/'+safe_string+'/@{},{},16z/data=!3m1!4b1'.format(latitude, longitude)
+#            url='https://www.google.com/maps/search/'+safe_string+'/@24.7962279,121.0449762,15z/data=!3m1!4b1?hl=zh-TW'
+#            print(url)
+#            url='https://www.google.com/maps/search/%E7%81%AB%E9%8D%8B%E9%A4%90%E5%BB%B3/@24.772608,121.0515456,13z'
+            driver.get(url)
+#            time.sleep(3)
+            keyin_keyword(driver, keyword)
+
+            process_web_request(db,driver,area_num,keyword)
+
+            # Click the element with this id (presumably the "next page"
+            # button -- confirm) until it is disabled, for at most 5 clicks.
+            pagecnt=0
+            while True:
+                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
+                if element.get_attribute('disabled'):
+                    break
+    #               driver.implicitly_wait(30)
+                ActionChains(driver).move_to_element(element).click(element).perform() 
+                process_web_request(db,driver,area_num,keyword)
+                pagecnt+=1
+                if pagecnt>=5:
+                    break
+
+
+#            table2.upsert({'kw':keyword,'num':job['num']},['kw'])
+            # NOTE(review): the ['kw'] argument looks carried over from the
+            # upsert call above; confirm what insert() does with a second
+            # positional argument.
+            table2.insert({'kw':keyword,'num':job['num']},['kw'])
+            # NOTE(review): SQL built by string concatenation with keyword.
+            db.query('update areacodes set expand = 1 where num="'+str(job['num'])+'" and kw="'+keyword+'" ')
+
+        except:
+            # Any failure is printed and counted; the loop then moves on to
+            # the next job.
+            traceback.print_exc()
+            failcnt+=1
+            if failcnt>=15:
+                sys.exit()
+            pass
+
+
+
+# Run the crawler only when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()

+ 2 - 1
swire_shop_item_list.py

@@ -180,9 +180,10 @@ def get_next_job(db,repeat=False,repkw=None,repnum=None):
 
     if repeat:
 #        cursor = db.query('select  lat_txt,lon_txt from swire_store_list where num ="'+str(result['num'])+'" and keyword="'+result['kw']+'" order by rand() limit 1')
-        cursor = db.query('select  lat_txt,lon_txt from swire_store_list order by rand() limit 1')
+        cursor = db.query('select  lat_txt,lon_txt,keyword from swire_store_list order by rand() limit 1')
 
         for c in cursor:
+            result['kw']=c['keyword']
             result['lat']=c['lat_txt']
             result['lon']=c['lon_txt']