jared 3 years ago
parent
commit
18c4e1510b
3 changed files with 290 additions and 0 deletions
  1. 217 0
      swire_shop_item_list.py
  2. 19 0
      utility/json_parsing.py
  3. 54 0
      utility/json_preprocess.py

+ 217 - 0
swire_shop_item_list.py

@@ -0,0 +1,217 @@
+# -*- coding: utf-8 -*-
+#from selenium import webdriver
+from seleniumwire import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+import selenium
+import traceback
+from bs4 import BeautifulSoup
+
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *
+
+from datetime import datetime
+import pandas as pd
+import dataset
+import time
+import json
+import re
+import sys, os
+import socket
+import brotli
+
+chrome_window = False  # when True, drive a local visible Chrome instead of the remote WebDriver
+
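+# Start either a local Chrome window (dev machine) or a remote ChromeDriver
+# served by a per-port Docker container at 127.0.0.1:<port>/wd/hub (see main()).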
+def browser_start(port):
+    options = webdriver.ChromeOptions()
+    if chrome_window:
+        browser = webdriver.Chrome(
+            desired_capabilities=options.to_capabilities()
+        )
+    else:
+        browser = webdriver.Remote(
+            command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
+            desired_capabilities=options.to_capabilities()
+        )
+
+    return browser
+
+
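+# Trigger lazy-loading of more results by repeatedly clicking just past the
+# right edge of the last separator element so the list panel scrolls. The
+# xpath_css argument is currently unused; the selector is hard-coded below.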
+def page_down_(driver, xpath_css, time_):
+    # Read the "x-y 項結果" ("x-y results") counter shown above the list.
+    e = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
+    result_count = e.text.split('-')[1].replace(' 項結果', '')
+    print(result_count)
+    if int(result_count) > 5:
+        for i in range(time_):
+            e = driver.find_elements_by_css_selector('div[class="TFQHme"]')
+            action = ActionChains(driver)
+            action.move_to_element_with_offset(e[-1], e[-1].size['width'] + 1, 0)
+            action.click()
+            action.perform()
+            time.sleep(0.5)
+
+
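+# Collect [url, name] pairs for every 'maps/place' anchor in the rendered
+# results panel.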
+def get_url_list(driver):
+    page_down_(driver, '//div[@class="TFQHme"]', 8)
+
+    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
+    url_list = []
+    for i in url_soup.find_all('a'):
+        try:
+            if i['href'].find('maps/place') != -1:
+                url_list += [[i['href'], i['aria-label']]]
+        except KeyError:
+            # Anchors without href/aria-label are skipped.
+            pass
+    # print(len(url_list))
+    return url_list
+
+
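+# Type the keyword into the Maps search box and submit it with Enter.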
+def keyin_keyword(driver, keyword):
+    button = driver.find_element_by_id("searchbox")
+    driver.implicitly_wait(30)
+    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
+    time.sleep(3)
+
+
+
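+# Pick the keyword to crawl. The earlier DB-driven selection logic is kept
+# below, commented out; the current run uses a fixed keyword.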
+def get_crawler_list(db):
+#    result = db.query('select * from shop_item_list order by keyword')
+#    result = pd.DataFrame([i for i in result])
+#    result = result[~result.keyword.str.contains('項')]
+
+#    progress = db.query('select distinct(kw) from progress_list2 where num < 367')
+#    progress = pd.DataFrame([i for i in progress])
+
+#    if len(progress) != 0:
+#        keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0]['keyword']
+#    else:
+#        keyword = result.iloc[0]['keyword']
+#        
+#    return keyword
+    # Hard-coded keyword ("滷味", braised snacks) for the current run; the
+    # query below is therefore unreachable and documents the intended lookup.
+    return '滷味'
+    cursor = db.query('select distinct(kw) from progress_list2 where num < 367 order by num asc limit 1')
+    for c in cursor:
+        return c['kw']
+    return None
+
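+# Resume point: look up the last grid number recorded for this keyword in
+# progress_list2, then return every lat/lon grid cell from that number on.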
+def get_lon_lat_list(db, keyword):
+    num=0
+    cursor=db.query('select num from progress_list2 where kw = "'+keyword+'"')
+    for c in cursor:
+        num=c['num']
+        break
+
+    cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
+
+    lst=[]
+    for c in cursor:
+        lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
+
+    return lst
+
+def write_to_file(jsobj, fname):
+    # Dump the extracted payload for offline inspection (see utility/json_parsing.py).
+    with open(fname, 'w', encoding='utf-8') as fw:
+        fw.write(str(jsobj))
+
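+# The Maps search XHR returns JavaScript that embeds a JSON array of places
+# as an escaped string. Unescape it, slice between the opening '[["' and the
+# closing ']]"', and parse the result with json.loads.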
+def parsing_js(orig):
+    content=""
+    lines=orig.split('\n')
+    for l in lines:
+        newl=l.replace('\\"','"')
+#        if '\\\\"' in newl:
+#            print(newl)
+#        newl=newl.repace('\\\\"','')
+        newl=newl.replace('\\"','"')
+
+        content+=newl
+    # The JSON array of places starts at '[["' and ends just before ']]"'.
+    result = re.search(r'\[\["', content)
+    print(result)
+    content_begin = result.start()
+
+    result = re.search(r'\]\]"', content)
+    print(result)
+
+    content_end = result.end()
+
+    # Drop the trailing quote of the JS string literal.
+    jscontent = content[content_begin:content_end - 1]
+    write_to_file(jscontent,'c:/tmp/debug.txt')
+    jsobj = json.loads(jscontent)
+    # Field positions below were located by inspecting the undocumented array
+    # layout and may shift whenever Google changes the response format.
+    for x in jsobj[0][1][1:]:
+        print(x[14][11])
+        print(x[14][10])
+        print(x[14][2])
+        print(x[14][78])
+
+
+
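+# Crawl loop: choose a keyword, fetch its lat/lon grid, and for each grid cell
+# open Google Maps at that coordinate, search the keyword, and harvest the
+# intercepted search XHR. Progress is upserted into progress_list2 so a
+# restarted worker resumes where it stopped.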
+def main():
+    global chrome_window
+    localip=socket.gethostbyname(socket.gethostname())
+    if localip=='192.168.1.108':
+        chrome_window=True
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    table = db['shop_item_list3']
+    table2 = db['progress_list2']
+
+    port=4447
+    if len(sys.argv) > 1 :
+        port=int(sys.argv[1])
+        print('restart docker p{}'.format(port))
+        os.system('sudo docker container restart p'+str(port))
+        time.sleep(8)
+
+    print('driver start...')
+    driver = browser_start(port)
+    
+
+    for i in range(10):
+        try:
+            keyword  = get_crawler_list(db)
+            print(keyword)
+            lst = get_lon_lat_list(db, keyword)
+#            print(lst)
+            print(keyword, len(lst))
+
+            for r in lst:
+                latitude = r['lat']   # latitude
+                longitude = r['lon']  # longitude
+                area_num=r['num']
+                table2.upsert({'kw':keyword,'num':r['num']},['kw'])
+
+                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
+                driver.get(url)
+                keyin_keyword(driver, keyword)
+                failcnt = 0
+
+#                query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
+                # Wait for the results panel to render, then scan the network
+                # traffic captured by selenium-wire for the Maps search XHR.
+                time.sleep(11)
+                for request in driver.requests:
+                    if request.response:
+                        if 'https://www.google.com.tw/search?tbm=map' in request.url:
+                            print('parsing js:')
+                            # The response body is brotli-compressed.
+                            resp = brotli.decompress(request.response.body)
+                            jstext = resp.decode('utf-8')
+                            parsing_js(jstext)
+
+
+
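+                # Page through the result list. Only the first two iterations
+                # click "next page"; the button id is a Google-generated token
+                # that may change between UI versions.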
+                for page in range(10):
+                    if page < 2:
+                        element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
+                        if element.get_attribute('disabled'):
+                            break
+                        driver.implicitly_wait(30)
+                        ActionChains(driver).move_to_element(element).click(element).perform()
+        except:
+            traceback.print_exc()
+
+
+if __name__ == '__main__':
+    main()

+ 19 - 0
utility/json_parsing.py

@@ -0,0 +1,19 @@
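+# Offline helper: re-parse the payload dumped to c:/tmp/debug.txt by
+# swire_shop_item_list.py and print selected fields of each place entry.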
+import codecs
+import json
+
+fr = codecs.open('c:/tmp/debug.txt', 'r', 'utf-8')
+content = fr.read()
+fr.close()
+
+print(content)
+jsobj = json.loads(content)
+for x in jsobj[0][1][1:]:
+    # Field positions located by inspection; see parsing_js in swire_shop_item_list.py.
+    print(x[14][11])
+    print(x[14][10])
+    print(x[14][2])
+    print(x[14][78])
+

+ 54 - 0
utility/json_preprocess.py

@@ -0,0 +1,54 @@
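+# Standalone preprocessing sketch: unescape a captured Maps response saved as
+# c:/tmp/ot.json, extract the embedded JSON array (same logic as parsing_js in
+# swire_shop_item_list.py), and write it to c:/tmp/result.json.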
+import codecs
+import re
+import json
+fw=codecs.open('c:/tmp/result.json','w','utf-8')
+
+content=""
+fr=codecs.open('c:/tmp/ot.json','r','utf-8')
+lines=fr.readlines()
+
+
+
+for l in lines:
+    newl=l.replace('\\"','"')
+    content+=newl
+result=re.search(r'\[\["',content)
+content_begin=result.start()
+
+result=re.search(r'\]\]"',content)
+content_end=result.end()
+
+
+# Write the extracted JSON slice (minus the trailing quote of the JS string
+# literal) to result.json.
+fw.write(content[content_begin:content_end - 1])
+fw.close()
+
+jscontent = content[content_begin:content_end - 1]
+jsobj = json.loads(jscontent)
+print()
+for x in jsobj[0][1][1:]:
+    # Same empirically-located fields printed by utility/json_parsing.py.
+    print(x[14][11])
+    print(x[14][10])
+    print(x[14][2])
+    print(x[14][78])
+
+# Exploration notes: scan x[14] for the 'ChIJ' place-id prefix to find its
+# index, and probe nested entries such as x[14][4][3].