noodles 2 years ago
parent
commit
c88b9c6b19
1 changed files with 57 additions and 51 deletions
  1. 57 51
      shop_item_crawler.py

+ 57 - 51
shop_item_crawler.py

@@ -16,13 +16,10 @@ from utility.connect import *
 from datetime import datetime
 import pandas as pd
 import dataset
-import requests
-import time
-import json
-import re
-import sys, os
-import socket
-import brotli
+import requests, random, time, json
+import re, sys, os
+import socket, brotli
+
 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 import urllib.parse
 chrome_window=False
@@ -280,16 +277,17 @@ def process_web_request(db, driver, area_num, keyword):
 
 
 def check_area_code(db, kw):
-    table_name = '{}.{}'.format(MYSQL_CONFIG['MYSQL_DB'], TABLE_AREACODES)
-    result = db.query('select distinct(kw) from {}'.format(table_name))
-    result = [i['kw'] for i in result]
+    if kw:
+        table_name = '{}.{}'.format(MYSQL_CONFIG['MYSQL_DB'], TABLE_AREACODES)
+        result = db.query('select distinct(kw) from {}'.format(table_name))
+        result = [i['kw'] for i in result]
 
-    if kw not in result:
-        try:
-            sql = 'insert into {} (select num,"{}" as kw,0 as expand from {}) '.format(table_name, kw, TABLE_LAT_LON)
-            db.query(sql) 
-        except:
-            traceback.print_exc()
+        if kw not in result:
+            try:
+                sql = 'insert into {} (select num,"{}" as kw, 0 as expand from {}) '.format(table_name, kw, TABLE_LAT_LON)
+                db.query(sql) 
+            except:
+                traceback.print_exc()
 
 
 def page_down_(driver, time_):
@@ -339,7 +337,6 @@ def main():
 
     db = dataset.connect('mysql://{}:{}@{}/{}?charset=utf8mb4'.format( MYSQL_CONFIG['MYSQL_USER'],
         MYSQL_CONFIG['MYSQL_PASSWORD'], MYSQL_CONFIG['MYSQL_HOST'], MYSQL_CONFIG['MYSQL_DB']))
-    iddict = build_cache(db)
     store_list_table = db[TABLE_STORE_LIST]
     table2 = db[TABLE_PROGRESS_LIST]
 
@@ -352,41 +349,50 @@ def main():
 
     print('drvier start...')
     driver = brower_start(port)
-    # check_area_code(db, globalkw)
-    area_num=None
-    if len(sys.argv) > 4 :
-        repkw = sys.argv[1]
-        repnum = sys.argv[2]
-        if 'SCAN' in repkw:
-            job = scan_job(db, repnum)
-        else:
-            job = get_next_job(db, repeat=True, repkw=repkw, repnum=repnum)
-    else:
+    check_area_code(db, globalkw)
+    for i in range(2):
+        area_num=None
+        # if len(sys.argv) > 4 :
+        #     repkw = sys.argv[1]
+        #     repnum = sys.argv[2]
+        #     if 'SCAN' in repkw:
+        #         job = scan_job(db, repnum)
+        #     else:
+        #         job = get_next_job(db, repeat=True, repkw=repkw, repnum=repnum)
+        # else:
         job = get_next_job(db, repkw=globalkw)
-    print(job)
-    keyword  = job['kw']
-    latitude = job['lat'] #緯度
-    longitude = job['lon'] #精度
-    area_num = job['num']
-
-    safe_string = urllib.parse.quote_plus(keyword)
-    url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
-    print(url)
-    prev_cnt=0
-    cursor = db.query('select count(*) as cnt from {} where num="{}" '.format(TABLE_STORE_LIST, str(area_num)))
-    for c in cursor:
-        prev_cnt = c['cnt']
-        break
-    driver.get(url)
-    time.sleep(2)
-    keyin_keyword(driver, keyword)
-    page_down_(driver, 10)
-    process_web_request(db, driver, area_num, keyword)
-
-
-    table2.insert({'kw':keyword,'num':job['num']},['kw'])
-    db.query('update {} set expand = 1 where num="'.format(TABLE_AREACODES)+str(job['num'])+'" and kw="'+keyword+'" ')
-
+        print(job)
+
+        keyword  = job['kw']
+        globalkw = keyword
+        latitude = job['lat'] # latitude
+        longitude = job['lon'] # longitude
+        area_num = job['num']
+
+        safe_string = urllib.parse.quote_plus(keyword)
+        for j in range(5):
+            iddict = build_cache(db)
+            if j != 0:
+                latitude_ = float(latitude) + (random.randint(-999,999) / 10000)
+                longitude_ = float(longitude) + (random.randint(-999,999) / 10000)
+            else:
+                latitude_, longitude_ = latitude, longitude
+            url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude_, longitude_)
+            print(url)
+            prev_cnt=0
+            cursor = db.query('select count(*) as cnt from {} where num="{}" '.format(TABLE_STORE_LIST, str(area_num)))
+            for c in cursor:
+                prev_cnt = c['cnt']
+                break
+            driver.get(url)
+            time.sleep(2)
+            keyin_keyword(driver, keyword)
+            # page_down_(driver, 3)
+            process_web_request(db, driver, area_num, keyword)
+            time.sleep(1)
+
+        table2.insert({'kw':keyword,'num':job['num']},['kw'])
+        db.query(f'update {TABLE_AREACODES} set expand = 1 where num="'+str(job['num'])+'" and kw="'+keyword+'" ')
 
 
 if __name__ == '__main__':