|
@@ -32,9 +32,6 @@ chrome_window=False
|
|
|
globalkw=None
|
|
|
proxyport=8787
|
|
|
|
|
|
-db_columns = ['id','author_page','author_name', 'profile_photo_url', 'author_review_count',
|
|
|
- 'created_at', 'text', 'photos', 'store_review_time','store_review']
|
|
|
-
|
|
|
|
|
|
def write_to_file(jsobj,fname):
|
|
|
with open(fname, 'wb') as handle:
|
|
@@ -79,16 +76,16 @@ def brower_start(port):
|
|
|
|
|
|
def get_next_job(db):
    """Pick the next pending job from ``error_list2`` and return it as a DataFrame.

    One random row with ``check_=0`` is selected. A ``fid`` column is derived
    from each row's ``item_url``, and rows whose ``item_url`` also appears in
    ``error_list3`` are dropped.

    Args:
        db: Object exposing ``query(sql)`` that returns an iterable of
            mapping-like rows (each row is passed to ``dict()``).

    Returns:
        pandas.DataFrame: The remaining job rows with an added ``fid`` column.
        When the job query yields no rows, the empty frame is returned as-is
        (it has no ``item_url`` / ``fid`` columns).
    """
    rows = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 1')
    url_pd = pd.DataFrame([dict(i) for i in rows])
    if url_pd.empty:
        # A frame built from zero rows has no columns, so indexing
        # 'item_url' below would raise KeyError; nothing to do anyway.
        return url_pd

    # fid is the 4th '!'-separated token of whatever follows 'data=' in the URL.
    url_pd['fid'] = url_pd['item_url'].apply(lambda x: x.split('data=')[1].split('!')[3])
    # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))

    remove = db.query('select item_url from error_list3')
    remove = pd.DataFrame([dict(i) for i in remove])
    # Same empty-frame hazard as above: only filter when error_list3 has rows.
    if not remove.empty:
        remove_fid_list = remove['item_url'].to_list()
        url_pd = url_pd[~url_pd['item_url'].isin(remove_fid_list)]

    return url_pd
|
|
|
|
|
@@ -265,7 +262,7 @@ def save_js_to_db(jsobj, fid):
|
|
|
|
|
|
|
|
|
def process_web_request_start(driver, fid):
|
|
|
- time.sleep(5)
|
|
|
+ time.sleep(3)
|
|
|
|
|
|
print("start&**********************")
|
|
|
for request in driver.requests:
|
|
@@ -273,9 +270,9 @@ def process_web_request_start(driver, fid):
|
|
|
# print(request.url)
|
|
|
if 'place?' in request.url :
|
|
|
print('parsing js:')
|
|
|
- print(request.url)
|
|
|
front, _ = fid.split(':')
|
|
|
if request.url.find(front) != -1:
|
|
|
+ print(request.url)
|
|
|
resp = brotli.decompress(request.response.body)
|
|
|
jstext = resp.decode('utf-8')
|
|
|
output = parsing_js(jstext)
|
|
@@ -285,9 +282,8 @@ def process_web_request_start(driver, fid):
|
|
|
|
|
|
|
|
|
def reviews_parsing_js(resp):
|
|
|
- columns_name = ['author_id','author_page','author_name', 'author_image', 'author_review_count',
|
|
|
- 'review_time', 'review_content', 'review_image',
|
|
|
- 'rating', 'store_review_time','store_review']
|
|
|
+ columns_name = ['id','author_page','author_name', 'profile_photo_url', 'author_review_count',
|
|
|
+ 'created_at', 'text', 'photos', 'rating', 'store_review_time','store_review']
|
|
|
jsobj = json.loads(resp[5::])
|
|
|
result = []
|
|
|
for i in range(len(jsobj[2])):
|
|
@@ -317,7 +313,6 @@ def reviews_parsing_js(resp):
|
|
|
|
|
|
|
|
|
def process_web_request_reviews(driver, output, start_js):
|
|
|
- time.sleep(0.8)
|
|
|
time.sleep(3)
|
|
|
|
|
|
print("reviews&**********************")
|
|
@@ -326,21 +321,25 @@ def process_web_request_reviews(driver, output, start_js):
|
|
|
# print(request.url)
|
|
|
if 'listentitiesreviews?' in request.url :
|
|
|
print('parsing js:')
|
|
|
- print(request.url)
|
|
|
if start_js.find(request.url.split('!')[-2]) != -1:
|
|
|
+ print(request.url)
|
|
|
resp = brotli.decompress(request.response.body)
|
|
|
jstext = resp.decode('utf-8')
|
|
|
result = reviews_parsing_js(jstext)
|
|
|
output['reviews'] = str(result)
|
|
|
time.sleep(1)
|
|
|
return output
|
|
|
+ return 0
|
|
|
|
|
|
|
|
|
def photos_parsing_js(resp):
|
|
|
def image_url_change_size(url):
    """Rewrite the trailing size spec of an image URL to start with ``s600``.

    The text after the last ``=`` is treated as a ``-``-separated option
    list; only its last two options are kept and ``s600`` is put in front
    (e.g. ``...=w203-h152-k-no`` becomes ``...=s600-k-no``).

    URLs containing ``streetviewpixels`` are returned untouched, as are URLs
    with no ``=`` at all (there is no size spec to rewrite, and splitting
    them would glue the whole URL onto itself).

    Args:
        url: Image URL string.

    Returns:
        str: The rewritten URL, or ``url`` unchanged for the cases above.
    """
    if 'streetviewpixels' in url or '=' not in url:
        return url
    url_split = url.split('=')
    # Keep everything before the first '=' and the last two options of the
    # final '='-separated segment.
    return url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2:])
|
|
|
|
|
|
jsobj = json.loads(resp[5::])
|
|
|
# write_to_file(jsobj,'tmp/debug_{}.pickle'.format(c))
|
|
@@ -388,7 +387,7 @@ def process_web_request_photo(driver, output, fid):
|
|
|
)
|
|
|
element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
|
|
|
ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
- time.sleep(2)
|
|
|
+ time.sleep(1)
|
|
|
|
|
|
print("photo&**********************")
|
|
|
menu_list = []
|
|
@@ -398,9 +397,9 @@ def process_web_request_photo(driver, output, fid):
|
|
|
# print(request.url)
|
|
|
if 'photo?' in request.url :
|
|
|
print('parsing js:')
|
|
|
- print(request.url)
|
|
|
front, _ = fid.split(':')
|
|
|
if request.url.find(front) != -1:
|
|
|
+ print(request.url)
|
|
|
resp = brotli.decompress(request.response.body)
|
|
|
jstext = resp.decode('utf-8')
|
|
|
menu, all = photos_parsing_js(jstext)
|
|
@@ -424,7 +423,7 @@ def main():
|
|
|
|
|
|
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
|
|
|
store_list_table = db['swire_store_list']
|
|
|
- shop_table = db['shop_list4']
|
|
|
+ shop_table = db['shop_list3']
|
|
|
error_table = db['error_list2']
|
|
|
|
|
|
iddict=build_cache(db)
|
|
@@ -460,13 +459,16 @@ def main():
|
|
|
|
|
|
print(fid, keyword, db_name)
|
|
|
print(item_url)
|
|
|
-
|
|
|
+ # ActionChains(driver).key_down(Keys.SHIFT).key_down(Keys.F5).perform()
|
|
|
+ # driver.find_element_by_tag_name('body').send_keys(Keys.F5)
|
|
|
+ # time.sleep(3)
|
|
|
+
|
|
|
#shop_info
|
|
|
print('parsing shop info....')
|
|
|
for i in range(5):
|
|
|
print('shop info try...{}'.format(i))
|
|
|
driver.get(item_url)
|
|
|
- time.sleep(5)
|
|
|
+ time.sleep(3)
|
|
|
|
|
|
wait = WebDriverWait(driver, 10)
|
|
|
wait.until(
|
|
@@ -475,7 +477,7 @@ def main():
|
|
|
element = driver.find_element_by_id('sb_cb50')
|
|
|
driver.implicitly_wait(10)
|
|
|
ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
- time.sleep(5)
|
|
|
+ time.sleep(3)
|
|
|
driver.back()
|
|
|
|
|
|
if driver.current_url == item_url:continue
|
|
@@ -502,12 +504,17 @@ def main():
|
|
|
ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
time.sleep(0.5)
|
|
|
|
|
|
- output = process_web_request_reviews(driver, output, start_js)
|
|
|
- break
|
|
|
+ output_ = process_web_request_reviews(driver, output, start_js)
|
|
|
+ if output_ != 0:
|
|
|
+ output = output_
|
|
|
+ break
|
|
|
except:
|
|
|
driver.get(item_url)
|
|
|
time.sleep(0.5)
|
|
|
|
|
|
+ if 'reviews' not in output.keys():
|
|
|
+ continue
|
|
|
+
|
|
|
# photo
|
|
|
print('parsing photo....')
|
|
|
if output['header_image'] != '':
|