noodles 2 years ago
parent
commit
db8617102e
1 changed file with 32 additions and 24 deletions
  1. 32 24
      run4.py

+ 32 - 24
run4.py

@@ -39,19 +39,15 @@ db_columns = ['id','author_page','author_name', 'profile_photo_url', 'author_rev
 def write_to_file(jsobj,fname):
     with open(fname, 'wb') as handle:
         pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
-    # import codecs
-    # fw=codecs.open(fname,'w','utf-8')
-    # fw.write(str(jsobj))
-    # fw.close()
 
 
 def build_cache(db):
     global reviews_table
     id_dict={}
-    cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
+    cursor = db.query('SELECT fid FROM google_poi.shop_list3;')
 
     for c in cursor:
-        key = '{}_{}'.format(c['fid'],c['author_id'])
+        key = '{}'.format(c['fid'])
         id_dict[key]=1
     return id_dict
 
@@ -83,9 +79,9 @@ def brower_start(port):
 
 def get_next_job(db):
     result = {}
-    result = db.query('select * from error_list2 ORDER BY RAND() limit 2')
+    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 5')
     url_pd = pd.DataFrame([dict(i) for i in result])
-
+    url_pd['fid'] = url_pd['item_url'].apply(lambda x: x.split('data=')[1].split('!')[3])
     # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
 
     # remove = db.query('select fid from review_process')
@@ -104,6 +100,7 @@ def parsing_js(resp, db_name):
 
     if txt[6][11] != db_name:
         return 0
+
     output['name'] = txt[6][11]
     output['adress_name'] = txt[6][18]
 
@@ -259,16 +256,13 @@ def time_parsing_js(time_json, output):
 
 
 def save_js_to_db(jsobj, fid):
-    global reviews_table
+    global shop_table
     global iddict
-    for r in jsobj:
-        r['fid'] = fid
-        key = '{}_{}'.format(r['fid'], r['author_id'])
-        if iddict.get(key) is not None:
-            continue
+  
+    jsobj['fid'] = fid
+    if iddict.get(fid) is None:
         try:
-            r['review_image'] = str(r['review_image'])
-            reviews_table.insert(r)
+            shop_table.insert(jsobj)
         except:
             traceback.print_exc()
 
@@ -286,8 +280,11 @@ def process_web_request_start(driver, db_name):
                 resp = brotli.decompress(request.response.body)
                 jstext = resp.decode('utf-8')
                 output = parsing_js(jstext, db_name)
-                time.sleep(1)
-                return output
+                if output  == 0:
+                    continue
+                else:
+                    time.sleep(1)
+                    return output
     return 0
 
 
@@ -365,7 +362,6 @@ def photos_parsing_js(resp, c):
     return menu, all
     
 
-
 def process_web_request_photo(driver, output):
     try:
         driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
@@ -420,7 +416,7 @@ def process_web_request_photo(driver, output):
 def main():
     global chrome_window
     global store_list_table
-    global reviews_table
+    global shop_table
     global proxyport
     global iddict
 
@@ -429,6 +425,7 @@ def main():
     db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
     store_list_table = db['swire_store_list']
     shop_table = db['shop_list4']
+    error_table = db['error_list2']
 
     iddict=build_cache(db)
     
@@ -454,13 +451,14 @@ def main():
             name = group['name']
             num = group['num']
             keyword = group['keyword']
+            fid = group['fid']
 
             if name:
                 db_name = name
             else:
                 db_name = num
 
-            print(name, num, keyword, db_name)
+            print(fid, name, num, keyword, db_name)
             print(item_url)
 
             #shop_info
@@ -468,9 +466,14 @@ def main():
             for i in range(5):
                 print('shop info try...{}'.format(i))
                 driver.get(item_url)
-                time.sleep(2)
+                driver.refresh()
+                time.sleep(5)
 
-                element = driver.find_element_by_id('searchbox-searchbutton')
+                wait = WebDriverWait(driver, 10)
+                wait.until(
+                    EC.element_to_be_clickable((By.ID, 'sb_cb50'))
+                )
+                element = driver.find_element_by_id('sb_cb50')
                 driver.implicitly_wait(10)
                 ActionChains(driver).move_to_element(element).click(element).perform()
                 time.sleep(5)
@@ -538,8 +541,13 @@ def main():
             output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
             output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
 
-            shop_table.insert(output,['item_url'])
+            # shop_table.insert(output,['item_url'])
+
+            save_js_to_db(output, fid)
+            error_table.upsert({'item_url':item_url,'check_':1},['item_url'])
         except:
+            error_table3 = db['error_list3']
+            error_table3.insert({'name':name,'keyword':keyword,'item_url':item_url,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
             traceback.print_exc()
 
 if __name__ == '__main__':