@@ -39,19 +39,15 @@ db_columns = ['id','author_page','author_name', 'profile_photo_url', 'author_rev
def write_to_file(jsobj,fname):
    with open(fname, 'wb') as handle:
        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
-    # import codecs
-    # fw=codecs.open(fname,'w','utf-8')
-    # fw.write(str(jsobj))
-    # fw.close()


def build_cache(db):
    global reviews_table
    id_dict={}
-    cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
+    cursor = db.query('SELECT fid FROM google_poi.shop_list3;')

    for c in cursor:
-        key = '{}_{}'.format(c['fid'],c['author_id'])
+        key = '{}'.format(c['fid'])
        id_dict[key]=1
    return id_dict

@@ -83,9 +79,9 @@ def brower_start(port):

def get_next_job(db):
    result = {}
-    result = db.query('select * from error_list2 ORDER BY RAND() limit 2')
+    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 5')
    url_pd = pd.DataFrame([dict(i) for i in result])
-
+    url_pd['fid'] = url_pd['item_url'].apply(lambda x: x.split('data=')[1].split('!')[3])
    # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))

    # remove = db.query('select fid from review_process')
@@ -104,6 +100,7 @@ def parsing_js(resp, db_name):

    if txt[6][11] != db_name:
        return 0
+
    output['name'] = txt[6][11]
    output['adress_name'] = txt[6][18]

@@ -259,16 +256,13 @@ def time_parsing_js(time_json, output):


def save_js_to_db(jsobj, fid):
-    global reviews_table
+    global shop_table
    global iddict
-    for r in jsobj:
-        r['fid'] = fid
-        key = '{}_{}'.format(r['fid'], r['author_id'])
-        if iddict.get(key) is not None:
-            continue
+
+    jsobj['fid'] = fid
+    if iddict.get(fid) is None:
        try:
-            r['review_image'] = str(r['review_image'])
-            reviews_table.insert(r)
+            shop_table.insert(jsobj)
        except:
            traceback.print_exc()

@@ -286,8 +280,11 @@ def process_web_request_start(driver, db_name):
            resp = brotli.decompress(request.response.body)
            jstext = resp.decode('utf-8')
            output = parsing_js(jstext, db_name)
-            time.sleep(1)
-            return output
+            if output == 0:
+                continue
+            else:
+                time.sleep(1)
+                return output
    return 0


|
@@ -365,7 +362,6 @@ def photos_parsing_js(resp, c):
|
|
|
return menu, all
|
|
|
|
|
|
|
|
|
-
|
|
|
def process_web_request_photo(driver, output):
|
|
|
try:
|
|
|
driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
|
|
@@ -420,7 +416,7 @@ def process_web_request_photo(driver, output):
def main():
    global chrome_window
    global store_list_table
-    global reviews_table
+    global shop_table
    global proxyport
    global iddict

@@ -429,6 +425,7 @@ def main():
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    shop_table = db['shop_list4']
+    error_table = db['error_list2']

    iddict=build_cache(db)

@@ -454,13 +451,14 @@ def main():
        name = group['name']
        num = group['num']
        keyword = group['keyword']
+        fid = group['fid']

        if name:
            db_name = name
        else:
            db_name = num

-        print(name, num, keyword, db_name)
+        print(fid, name, num, keyword, db_name)
        print(item_url)

        #shop_info
@@ -468,9 +466,14 @@ def main():
        for i in range(5):
            print('shop info try...{}'.format(i))
            driver.get(item_url)
-            time.sleep(2)
+            driver.refresh()
+            time.sleep(5)

-            element = driver.find_element_by_id('searchbox-searchbutton')
+            wait = WebDriverWait(driver, 10)
+            wait.until(
+                EC.element_to_be_clickable((By.ID, 'sb_cb50'))
+            )
+            element = driver.find_element_by_id('sb_cb50')
            driver.implicitly_wait(10)
            ActionChains(driver).move_to_element(element).click(element).perform()
            time.sleep(5)
@@ -538,8 +541,13 @@ def main():
            output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
            output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")

-            shop_table.insert(output,['item_url'])
+            # shop_table.insert(output,['item_url'])
+
+            save_js_to_db(output, fid)
+            error_table.upsert({'item_url':item_url,'check_':1},['item_url'])
        except:
+            error_table3 = db['error_list3']
+            error_table3.insert({'name':name,'keyword':keyword,'item_url':item_url,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
            traceback.print_exc()

if __name__ == '__main__':
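
Note: the explicit wait added in main() calls WebDriverWait and EC.element_to_be_clickable, but no import hunk appears in this diff. If the module does not already import these helpers, an addition along the following lines would be needed near the other selenium imports (a minimal sketch using the standard Selenium import paths; By appears to be imported already, given the existing By.CSS_SELECTOR usage):

from selenium.webdriver.support.ui import WebDriverWait           # explicit-wait helper used by the new code
from selenium.webdriver.support import expected_conditions as EC  # provides element_to_be_clickable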