@@ -7,6 +7,7 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
import selenium
+import gzip
import traceback
from bs4 import BeautifulSoup
@@ -35,13 +36,10 @@ db_columns = ['author_id','author_page','author_name', 'author_image', 'author_r
'review_time', 'review_content', 'review_image',
def write_to_file(jsobj,fname):
with open(fname, 'wb') as handle:
pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
- # import codecs
- # fw=codecs.open(fname,'w','utf-8')
- # fw.write(str(jsobj))
- # fw.close()
def build_cache(db):
@@ -82,15 +80,13 @@ def brower_start(port):
def get_next_job(db):
+ """Fetch the next shop whose reviews still have to be crawled.
+
+ The query takes one random shop_list3 row that has a ludocid and a
+ user_ratings_total and whose fid is not in review_process yet, then joins
+ it with google_poi.swire_store_list on fid to obtain place_id.
+
+ Returns a DataFrame of the selected columns plus ``item_url`` (built from
+ place_id). When the query yields no rows the frame is returned empty and
+ without an ``item_url`` column.
+ """
result = {}
- result = db.query('select * from swire_store_list ORDER BY RAND() limit 1')
+ sql = '''select t1.name, t1.ludocid, t1.fid, t1.user_ratings_total, t2.place_id from
+ (select * from shop_list3 where ludocid is NOT NULL and user_ratings_total is NOT NULL and
+ fid not in (select fid from review_process ) ORDER BY RAND() limit 1 )
+ as t1 join google_poi.swire_store_list as t2 on t1.fid = t2.fid'''
+ result = db.query(sql)
url_pd = pd.DataFrame([dict(i) for i in result])
- url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
- remove = db.query('select fid from review_process')
- remove = pd.DataFrame([dict(i) for i in remove])
- remove_fid_list = remove['fid'].to_list()
- url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]
+ # A frame built from zero rows has no columns, so indexing 'place_id' would
+ # raise KeyError; only derive item_url when the query returned something.
+ if not url_pd.empty:
+ url_pd['item_url'] = url_pd['place_id'].apply(lambda x: 'https://www.google.com/maps/place/?q=place_id:{}'.format(x) )
return url_pd
@@ -141,22 +137,32 @@ def save_js_to_db(jsobj, fid):
-def process_web_request(db, driver, fid):
- time.sleep(0.8)
+def process_web_request(driver, fid, ludocid):
+ """Parse and store the captured reviews response that belongs to one shop.
+
+ Walks driver.requests and takes the first request that has a response and
+ whose URL contains both 'listentitiesreviews?' and the given ludocid. Its
+ body is decompressed according to the Content-Encoding header, decoded as
+ UTF-8, parsed with parsing_js and written with save_js_to_db(result, fid).
+
+ Returns 1 after a matching response was saved, 0 when none was found.
+ """
for request in driver.requests:
if request.response:
# print(request.url)
if 'listentitiesreviews?' in request.url :
- print('parsing js:')
- print(request.url)
- resp = brotli.decompress(request.response.body)
- jstext = resp.decode('utf-8')
- result = parsing_js(jstext)
+ # str() so an integer ludocid coming from the DataFrame row still matches.
+ if str(ludocid) in request.url:
+ print('parsing js:')
+ print(request.url)
+ # Pick the decompressor from the header; it may be absent, in which
+ # case the body is used as received.
+ encoding = request.response.headers.get('Content-Encoding') or ''
+ resp = request.response.body
+ if 'gzip' in encoding:
+ resp = gzip.decompress(resp)
+ elif 'br' in encoding:
+ resp = brotli.decompress(resp)
- save_js_to_db(result, fid)
- time.sleep(1)
+ jstext = resp.decode('utf-8')
+ result = parsing_js(jstext)
+ save_js_to_db(result, fid)
+ time.sleep(1)
+ return 1
+ return 0
def page_down_(driver, xpath_css, time_):
@@ -200,7 +206,7 @@ def main():
global proxyport
global iddict
- localip=socket.gethostbyname(socket.gethostname())
+ # localip=socket.gethostbyname(socket.gethostname())
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
store_list_table = db['swire_store_list']
@@ -229,20 +235,25 @@ def main():
item_url = group['item_url']
reviews_cnt = group['reviews_cnt']
fid = group['fid']
+ ludocid = group['ludocid']
print(reviews_cnt, item_url)
- driver.get(item_url)
- time.sleep(0.5)
- shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
- tmp_value = shop_soup.find('span', {'jsaction':'pane.rating.moreReviews'})
- if tmp_value:
+ for i in range(3):
+ print('reviews try...{}'.format(i))
+ print("reviews try.....{}".format(datetime.now()))
+ driver.get(item_url)
+ time.sleep(0.5)
get_reviews(driver, reviews_cnt)
- process_web_request(db, driver, fid)
+ status = process_web_request(driver, fid, ludocid)
- db['review_process'].insert({'fid':fid, 'dt':datetime.now()})
+ if status:
+ db['review_process'].insert({'fid':fid, 'dt':datetime.now()})
+ break