# -*- coding: utf-8 -*-
"""Crawl Google Maps store reviews through a Selenium Grid node fronted by
selenium-wire, parse the captured listentitiesreviews responses, and write
the rows into google_poi.reviews_table via dataset."""

# from selenium import webdriver
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import selenium
import traceback
from bs4 import BeautifulSoup

from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *

from datetime import datetime
from requests import session
import pandas as pd
import dataset
import time
import json
import re
import sys, os
import socket
import brotli
import pickle
import urllib.parse

chrome_window = False
globalkw = None
proxyport = 8787

# Column order matches the field order produced by parsing_js().
db_columns = ['author_id', 'author_page', 'author_name', 'author_image', 'author_review_count',
              'review_time', 'review_content', 'review_image',
              'store_review_time', 'store_review']


def write_to_file(jsobj, fname):
    """Pickle an arbitrary object to disk (used for ad-hoc debugging)."""
    with open(fname, 'wb') as handle:
        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # import codecs
    # fw = codecs.open(fname, 'w', 'utf-8')
    # fw.write(str(jsobj))
    # fw.close()


def build_cache(db):
    """Build a {fid_authorid: 1} lookup of reviews already stored, so that
    save_js_to_db() can skip duplicates."""
    id_dict = {}
    cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
    for c in cursor:
        key = '{}_{}'.format(c['fid'], c['author_id'])
        id_dict[key] = 1
    return id_dict


def brower_start(port):
    """Start a browser session: a local Chrome when chrome_window is True,
    otherwise a Remote session against the Selenium Grid hub on the given
    port, routed through the selenium-wire proxy so review XHRs can be read."""
    global proxyport
    global chrome_window
    print(proxyport)
    options = webdriver.ChromeOptions()
    if chrome_window:
        browser = webdriver.Chrome(
            desired_capabilities=options.to_capabilities()
        )
    else:
        chrome_options = webdriver.ChromeOptions()
        # Send the remote Chrome's traffic back to the selenium-wire proxy
        # running in this process (host.docker.internal from the container).
        chrome_options.add_argument('--proxy-server=host.docker.internal:' + str(proxyport))
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
            desired_capabilities=chrome_options.to_capabilities(),
            seleniumwire_options={'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False}
        )
        browser.set_window_size(1400, 1000)
    return browser


def get_next_job(db):
    """Pick one random store from swire_store_list that has no row in
    review_process yet, and return it as a DataFrame exposing the columns
    used downstream: fid, reviews_cnt and the derived item_url."""
    result = db.query('select * from swire_store_list ORDER BY RAND() limit 1')
    url_pd = pd.DataFrame([dict(i) for i in result])
    url_pd['item_url'] = url_pd['fid'].apply(
        lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))

    remove = db.query('select fid from review_process')
    remove = pd.DataFrame([dict(i) for i in remove])
    remove_fid_list = remove['fid'].to_list()

    url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]

    return url_pd
def parsing_js(resp):
    """Parse one listentitiesreviews response body into a list of dicts keyed
    by db_columns. The payload starts with a 5-character protective prefix,
    which is stripped before json.loads()."""
    jsobj = json.loads(resp[5:])

    result = []
    for i in range(len(jsobj[2])):
        review = jsobj[2][i]
        tmp = []
        # author_id, author_page, author_name, author_image, author_review_count
        tmp += [review[6], review[0][0], review[0][1], review[0][2], review[12][1][1]]
        # review_time, review_content
        tmp += [review[1], review[3]]

        # review_image: collect every attached photo URL
        image = []
        if review[14]:
            for j in range(len(review[14])):
                image += [review[14][j][6][0]]
        tmp += [image]

        # store reply (time, text); empty strings when the store has not replied
        if review[9]:
            tmp += [review[9][0], review[9][1]]
        else:
            tmp += ['', '']

        tmp_dict = {}
        for k in range(len(db_columns)):
            tmp_dict[db_columns[k]] = tmp[k]
        tmp_dict['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
        result.append(tmp_dict)

    # write_to_file(orig, 'debug.pickle')
    return result


def save_js_to_db(jsobj, fid):
    """Insert parsed reviews for one store, skipping fid/author_id pairs that
    are already in the cache built by build_cache()."""
    global reviews_table
    global iddict

    for r in jsobj:
        r['fid'] = fid
        key = '{}_{}'.format(r['fid'], r['author_id'])
        if iddict.get(key) is not None:
            continue
        try:
            r['review_image'] = str(r['review_image'])
            reviews_table.insert(r)
        except:
            traceback.print_exc()


def process_web_request(db, driver, fid):
    """Scan the requests captured by selenium-wire, decode the brotli-compressed
    listentitiesreviews responses and persist the parsed reviews."""
    time.sleep(3)
    for request in driver.requests:
        if request.response:
            if 'listentitiesreviews?' in request.url:
                print('parsing js:')
                print(request.url)
                resp = brotli.decompress(request.response.body)
                jstext = resp.decode('utf-8')
                result = parsing_js(jstext)
                save_js_to_db(result, fid)
                time.sleep(1)


def page_down_(driver, xpath_css, time_):
    """Click into the reviews pane and press PAGE_DOWN time_ times so that more
    reviews are lazy-loaded (and more review XHRs are captured)."""
    elmts = driver.find_elements_by_xpath(xpath_css)
    print(elmts)
    if len(elmts) > 1:
        elmt = elmts[1]
    else:
        elmt = elmts[0]

    actions = ActionChains(driver)
    actions.move_to_element(elmt).click().perform()

    for i in range(time_):
        try:
            actions = ActionChains(driver)
            actions.send_keys(Keys.PAGE_DOWN).perform()
        except:
            traceback.print_exc()
        time.sleep(0.5)


def get_reviews(driver, reviews_cnt):
    """Open the "more reviews" pane and scroll far enough to trigger loading of
    roughly all reviews_cnt reviews."""
    wait = WebDriverWait(driver, 30)
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
    )
    element = driver.find_element_by_css_selector(more_reviews_css)
    driver.implicitly_wait(10)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(0.5)

    reviews_cnt = int(reviews_cnt)
    if reviews_cnt > 10:
        page_down_count = reviews_cnt // 3
        page_down_(driver, '//div[@class="PPCwl"]', page_down_count)
def main():
    global chrome_window
    global store_list_table
    global reviews_table
    global proxyport
    global iddict
    localip = socket.gethostbyname(socket.gethostname())

    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    reviews_table = db['reviews_table']
    iddict = build_cache(db)

    port = 4444
    if len(sys.argv) == 3:
        port = int(sys.argv[1])
        proxyport = int(sys.argv[2])
    if not chrome_window:
        print('restart docker pw{}'.format(port))
        # os.system('sudo docker container restart p' + str(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(10)

    print('driver start...')
    driver = brower_start(port)

    job = get_next_job(db)

    for row, group in job.iterrows():
        try:
            item_url = group['item_url']
            reviews_cnt = group['reviews_cnt']
            fid = group['fid']
            print(reviews_cnt, item_url)

            driver.get(item_url)
            time.sleep(0.5)

            shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
            tmp_value = shop_soup.find('span', {'jsaction': 'pane.rating.moreReviews'})
            if tmp_value:
                get_reviews(driver, reviews_cnt)
                process_web_request(db, driver, fid)
                print(driver.current_url)

            # Mark the store as processed whether or not it had reviews.
            db['review_process'].insert({'fid': fid, 'dt': datetime.now()})
        except:
            traceback.print_exc()


if __name__ == '__main__':
    main()
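
# Usage note (a sketch inferred from the argument handling in main(); the
# filename "run_reviews.py" below is only illustrative, not the script's
# actual name):
#
#   python run_reviews.py <selenium_grid_port> <seleniumwire_proxy_port>
#   e.g. python run_reviews.py 4444 8787
#
# With no arguments, main() falls back to Selenium Grid port 4444 and the
# module-level proxyport 8787, and (when chrome_window is False) restarts the
# "pw<port>" docker container before connecting to the Grid hub.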