|
@@ -7,6 +7,7 @@ from selenium.webdriver.support import expected_conditions as EC
|
|
|
from selenium.webdriver.support.wait import WebDriverWait
|
|
|
from selenium.webdriver.common.by import By
|
|
|
import selenium
|
|
|
+import gzip
|
|
|
import traceback
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
@@ -35,13 +36,10 @@ db_columns = ['author_id','author_page','author_name', 'author_image', 'author_r
|
|
|
'review_time', 'review_content', 'review_image',
|
|
|
'store_review_time','store_review']
|
|
|
|
|
|
+
|
|
|
def write_to_file(jsobj, fname):
    """Pickle *jsobj* into the file at *fname*.

    The file is opened in binary write mode (truncating any existing
    content) and the object is serialized with the highest pickle protocol
    available to the running interpreter.

    Args:
        jsobj: Any picklable Python object.
        fname: Path of the file to create or overwrite.
    """
    with open(fname, 'wb') as handle:
        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
|
|
|
|
|
|
|
|
def build_cache(db):
|
|
@@ -82,15 +80,13 @@ def brower_start(port):
|
|
|
|
|
|
def get_next_job(db):
    """Pick one random, not-yet-processed shop and build its Maps URL.

    The query selects a single random row from ``shop_list3`` that has both
    ``ludocid`` and ``user_ratings_total`` set and whose ``fid`` does not yet
    appear in ``review_process``, then joins it to
    ``google_poi.swire_store_list`` on ``fid`` to obtain ``place_id``.

    Args:
        db: Database handle exposing ``query(sql)`` that yields row mappings.

    Returns:
        A ``pandas.DataFrame`` with the columns ``name``, ``ludocid``,
        ``fid``, ``user_ratings_total``, ``place_id`` and ``item_url``.
        The frame is empty (but still has those columns) when the query
        returns no rows.
    """
    sql = '''select t1.name, t1.ludocid, t1.fid, t1.user_ratings_total, t2.place_id from
        (select * from shop_list3 where ludocid is NOT NULL and user_ratings_total is NOT NULL and
        fid not in (select fid from review_process ) ORDER BY RAND() limit 1 )
        as t1 join google_poi.swire_store_list as t2 on t1.fid = t2.fid'''
    rows = [dict(row) for row in db.query(sql)]

    if not rows:
        # No candidate left (or the join found no place_id): return an empty
        # frame instead of failing with KeyError on the missing column.
        return pd.DataFrame(columns=['name', 'ludocid', 'fid',
                                     'user_ratings_total', 'place_id',
                                     'item_url'])

    url_pd = pd.DataFrame(rows)
    url_pd['item_url'] = url_pd['place_id'].apply(
        'https://www.google.com/maps/place/?q=place_id:{}'.format)
    return url_pd
|
|
|
|
|
@@ -141,22 +137,32 @@ def save_js_to_db(jsobj, fid):
|
|
|
traceback.print_exc()
|
|
|
|
|
|
|
|
|
def _decode_response_body(response):
    """Return the response body decompressed per its Content-Encoding header.

    gzip bodies go through ``gzip.decompress``, brotli (``br``) bodies through
    ``brotli.decompress``; a missing or unrecognized encoding returns the raw
    body unchanged.
    """
    # headers.get() returns None when the header is absent; normalize to ''.
    encoding = (response.headers.get('Content-Encoding') or '').lower()
    body = response.body
    if 'gzip' in encoding:
        return gzip.decompress(body)
    if 'br' in encoding:
        return brotli.decompress(body)
    return body


def process_web_request(driver, fid, ludocid):
    """Find the captured reviews response for one shop and store it.

    Waits three seconds, then scans ``driver.requests`` for the first
    answered request whose URL contains ``'listentitiesreviews?'`` and the
    given ``ludocid``. That response is decompressed, decoded as UTF-8,
    parsed with ``parsing_js`` and saved with ``save_js_to_db``.

    Args:
        driver: Browser driver exposing captured traffic as ``requests``
            (presumably a Selenium Wire driver -- confirm against
            ``brower_start``).
        fid: Shop identifier passed through to ``save_js_to_db``.
        ludocid: Identifier that must occur in the request URL; it is
            converted to ``str`` before matching.

    Returns:
        1 if a matching response was parsed and saved, otherwise 0.
    """
    time.sleep(3)
    print("ppppppppp&**********************")

    # The id may arrive as a number from the DataFrame row; URL matching
    # needs a string.
    ludocid = str(ludocid)

    for request in driver.requests:
        if not request.response:
            continue
        if 'listentitiesreviews?' not in request.url:
            continue
        if ludocid not in request.url:
            continue

        print('parsing js:')
        print(request.url)

        # Choose the decoder from the header instead of always trying brotli
        # first, which raised on gzip or unencoded bodies.
        resp = _decode_response_body(request.response)
        jstext = resp.decode('utf-8')
        result = parsing_js(jstext)

        save_js_to_db(result, fid)
        time.sleep(1)
        return 1

    return 0
|
|
|
|
|
|
|
|
|
def page_down_(driver, xpath_css, time_):
|
|
@@ -200,7 +206,7 @@ def main():
|
|
|
global proxyport
|
|
|
global iddict
|
|
|
|
|
|
- localip=socket.gethostbyname(socket.gethostname())
|
|
|
+ # localip=socket.gethostbyname(socket.gethostname())
|
|
|
|
|
|
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
|
|
|
store_list_table = db['swire_store_list']
|
|
@@ -229,20 +235,25 @@ def main():
|
|
|
item_url = group['item_url']
|
|
|
reviews_cnt = group['reviews_cnt']
|
|
|
fid = group['fid']
|
|
|
+ ludocid = group['ludocid']
|
|
|
|
|
|
print(reviews_cnt, item_url)
|
|
|
- driver.get(item_url)
|
|
|
- time.sleep(0.5)
|
|
|
- shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
- tmp_value = shop_soup.find('span', {'jsaction':'pane.rating.moreReviews'})
|
|
|
|
|
|
- if tmp_value:
|
|
|
+ for i in range(3):
|
|
|
+ print('reviews try...{}'.format(i))
|
|
|
+ print("reviews try.....{}".format(datetime.now()))
|
|
|
+
|
|
|
+ driver.get(item_url)
|
|
|
+ time.sleep(0.5)
|
|
|
+
|
|
|
get_reviews(driver, reviews_cnt)
|
|
|
- process_web_request(db, driver, fid)
|
|
|
+ status = process_web_request(driver, fid, ludocid)
|
|
|
print(driver.current_url)
|
|
|
|
|
|
- db['review_process'].insert({'fid':fid, 'dt':datetime.now()})
|
|
|
-
|
|
|
+ if status:
|
|
|
+ db['review_process'].insert({'fid':fid, 'dt':datetime.now()})
|
|
|
+ break
|
|
|
+
|
|
|
except:
|
|
|
traceback.print_exc()
|
|
|
|