jared 3 years ago
parent
commit
18c4e1510b
3 changed files with 290 additions and 0 deletions
  1. 217 0
      swire_shop_item_list.py
  2. 19 0
      utility/json_parsing.py
  3. 54 0
      utility/json_preprocess.py

+ 217 - 0
swire_shop_item_list.py

@@ -0,0 +1,217 @@
+# -*- coding: utf-8 -*-
+#from selenium import webdriver
+from seleniumwire import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+import selenium
+import traceback
+from bs4 import BeautifulSoup
+
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *
+
+from datetime import datetime
+import pandas as pd
+import dataset
+import time
+import json
+import re
+import sys, os
+import socket
+import brotli
+
+chrome_window = False  # when True, drive a local visible Chrome instead of the remote WebDriver
+
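+# Start either a local Chrome window (dev machine) or a remote ChromeDriver
+# served by a per-port Docker container at 127.0.0.1:<port>/wd/hub (see main()).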
+def browser_start(port):
+    options = webdriver.ChromeOptions()
+    if chrome_window:
+        browser = webdriver.Chrome(
+            desired_capabilities=options.to_capabilities()
+        )
+    else:
+        browser = webdriver.Remote(
+            command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
+            desired_capabilities=options.to_capabilities()
+        )
+
+    return browser
+
+
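+# Trigger lazy-loading of more results by repeatedly clicking just past the
+# right edge of the last separator element so the list panel scrolls. The
+# xpath_css argument is currently unused; the selector is hard-coded below.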
+def page_down_(driver, xpath_css, time_):
+    # Read the "x-y 項結果" ("x-y results") counter shown above the list.
+    e = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
+    result_count = e.text.split('-')[1].replace(' 項結果', '')
+    print(result_count)
+    if int(result_count) > 5:
+        for i in range(time_):
+            e = driver.find_elements_by_css_selector('div[class="TFQHme"]')
+            action = ActionChains(driver)
+            action.move_to_element_with_offset(e[-1], e[-1].size['width'] + 1, 0)
+            action.click()
+            action.perform()
+            time.sleep(0.5)
+
+
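+# Collect [url, name] pairs for every 'maps/place' anchor in the rendered
+# results panel.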
+def get_url_list(driver):
+    page_down_(driver, '//div[@class="TFQHme"]', 8)
+
+    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
+    url_list = []
+    for i in url_soup.find_all('a'):
+        try:
+            if i['href'].find('maps/place') != -1:
+                url_list += [[i['href'], i['aria-label']]]
+        except KeyError:
+            # Anchors without href/aria-label are skipped.
+            pass
+    # print(len(url_list))
+    return url_list
+
+
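+# Type the keyword into the Maps search box and submit it with Enter.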
+def keyin_keyword(driver, keyword):
+    button = driver.find_element_by_id("searchbox")
+    driver.implicitly_wait(30)
+    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
+    time.sleep(3)
+
+
+
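+# Pick the keyword to crawl. The earlier DB-driven selection logic is kept
+# below, commented out; the current run uses a fixed keyword.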
+def get_crawler_list(db):
+#    result = db.query('select * from shop_item_list order by keyword')
+#    result = pd.DataFrame([i for i in result])
+#    result = result[~result.keyword.str.contains('項')]
+
+#    progress = db.query('select distinct(kw) from progress_list2 where num < 367')
+#    progress = pd.DataFrame([i for i in progress])
+
+#    if len(progress) != 0:
+#        keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0]['keyword']
+#    else:
+#        keyword = result.iloc[0]['keyword']
+#        
+#    return keyword
+    # Hard-coded keyword ("滷味", braised snacks) for the current run; the
+    # query below is therefore unreachable and documents the intended lookup.
+    return '滷味'
+    cursor = db.query('select distinct(kw) from progress_list2 where num < 367 order by num asc limit 1')
+    for c in cursor:
+        return c['kw']
+    return None
+
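+# Resume point: look up the last grid number recorded for this keyword in
+# progress_list2, then return every lat/lon grid cell from that number on.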
+def get_lon_lat_list(db, keyword):
+    num=0
+    cursor=db.query('select num from progress_list2 where kw = "'+keyword+'"')
+    for c in cursor:
+        num=c['num']
+        break
+
+    cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
+
+    lst=[]
+    for c in cursor:
+        lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
+
+    return lst
+
+def write_to_file(jsobj, fname):
+    # Dump the extracted payload for offline inspection (see utility/json_parsing.py).
+    with open(fname, 'w', encoding='utf-8') as fw:
+        fw.write(str(jsobj))
+
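+# The Maps search XHR returns JavaScript that embeds a JSON array of places
+# as an escaped string. Unescape it, slice between the opening '[["' and the
+# closing ']]"', and parse the result with json.loads.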
+def parsing_js(orig):
+    content=""
+    lines=orig.split('\n')
+    for l in lines:
+        newl=l.replace('\\"','"')
+#        if '\\\\"' in newl:
+#            print(newl)
+#        newl=newl.repace('\\\\"','')
+        newl=newl.replace('\\"','"')
+
+        content+=newl
+    # The JSON array of places starts at '[["' and ends just before ']]"'.
+    result = re.search(r'\[\["', content)
+    print(result)
+    content_begin = result.start()
+
+    result = re.search(r'\]\]"', content)
+    print(result)
+
+    content_end = result.end()
+
+    # Drop the trailing quote of the JS string literal.
+    jscontent = content[content_begin:content_end - 1]
+    write_to_file(jscontent,'c:/tmp/debug.txt')
+    jsobj = json.loads(jscontent)
+    # Field positions below were located by inspecting the undocumented array
+    # layout and may shift whenever Google changes the response format.
+    for x in jsobj[0][1][1:]:
+        print(x[14][11])
+        print(x[14][10])
+        print(x[14][2])
+        print(x[14][78])
+
+
+
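+# Crawl loop: choose a keyword, fetch its lat/lon grid, and for each grid cell
+# open Google Maps at that coordinate, search the keyword, and harvest the
+# intercepted search XHR. Progress is upserted into progress_list2 so a
+# restarted worker resumes where it stopped.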
+def main():
+    global chrome_window
+    localip=socket.gethostbyname(socket.gethostname())
+    if localip=='192.168.1.108':
+        chrome_window=True
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    table = db['shop_item_list3']
+    table2 = db['progress_list2']
+
+    port=4447
+    if len(sys.argv) > 1 :
+        port=int(sys.argv[1])
+        print('restart docker p{}'.format(port))
+        os.system('sudo docker container restart p'+str(port))
+        time.sleep(8)
+
+    print('driver start...')
+    driver = browser_start(port)
+    
+
+    for i in range(10):
+        try:
+            keyword  = get_crawler_list(db)
+            print(keyword)
+            lst = get_lon_lat_list(db, keyword)
+#            print(lst)
+            print(keyword, len(lst))
+
+            for r in lst:
+                latitude = r['lat']   # latitude
+                longitude = r['lon']  # longitude
+                area_num=r['num']
+                table2.upsert({'kw':keyword,'num':r['num']},['kw'])
+
+                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
+                driver.get(url)
+                keyin_keyword(driver, keyword)
+                failcnt = 0
+
+#                query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
+                # Wait for the results panel to render, then scan the network
+                # traffic captured by selenium-wire for the Maps search XHR.
+                time.sleep(11)
+                for request in driver.requests:
+                    if request.response:
+                        if 'https://www.google.com.tw/search?tbm=map' in request.url:
+                            print('parsing js:')
+                            # The response body is brotli-compressed.
+                            resp = brotli.decompress(request.response.body)
+                            jstext = resp.decode('utf-8')
+                            parsing_js(jstext)
+
+
+
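+                # Page through the result list. Only the first two iterations
+                # click "next page"; the button id is a Google-generated token
+                # that may change between UI versions.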
+                for page in range(10):
+                    if page < 2:
+                        element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
+                        if element.get_attribute('disabled'):
+                            break
+                        driver.implicitly_wait(30)
+                        ActionChains(driver).move_to_element(element).click(element).perform()
+        except:
+            traceback.print_exc()
+
+
+if __name__ == '__main__':
+    main()

+ 19 - 0
utility/json_parsing.py

@@ -0,0 +1,19 @@
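+# Offline helper: re-parse the payload dumped to c:/tmp/debug.txt by
+# swire_shop_item_list.py and print selected fields of each place entry.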
+import codecs
+import json
+
+fr = codecs.open('c:/tmp/debug.txt', 'r', 'utf-8')
+content = fr.read()
+fr.close()
+
+print(content)
+jsobj = json.loads(content)
+for x in jsobj[0][1][1:]:
+    # Field positions located by inspection; see parsing_js in swire_shop_item_list.py.
+    print(x[14][11])
+    print(x[14][10])
+    print(x[14][2])
+    print(x[14][78])
+

+ 54 - 0
utility/json_preprocess.py

@@ -0,0 +1,54 @@
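+# Standalone preprocessing sketch: unescape a captured Maps response saved as
+# c:/tmp/ot.json, extract the embedded JSON array (same logic as parsing_js in
+# swire_shop_item_list.py), and write it to c:/tmp/result.json.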
+import codecs
+import re
+import json
+fw=codecs.open('c:/tmp/result.json','w','utf-8')
+
+content=""
+fr=codecs.open('c:/tmp/ot.json','r','utf-8')
+lines=fr.readlines()
+
+
+
+for l in lines:
+    newl=l.replace('\\"','"')
+    content+=newl
+result=re.search(r'\[\["',content)
+content_begin=result.start()
+
+result=re.search(r'\]\]"',content)
+content_end=result.end()
+
+
+# Write the extracted JSON slice (minus the trailing quote of the JS string
+# literal) to result.json.
+fw.write(content[content_begin:content_end - 1])
+fw.close()
+
+jscontent = content[content_begin:content_end - 1]
+jsobj = json.loads(jscontent)
+print()
+for x in jsobj[0][1][1:]:
+    # Same empirically-located fields printed by utility/json_parsing.py.
+    print(x[14][11])
+    print(x[14][10])
+    print(x[14][2])
+    print(x[14][78])
+
+# Exploration notes: scan x[14] for the 'ChIJ' place-id prefix to find its
+# index, and probe nested entries such as x[14][4][3].