# -*- coding: utf-8 -*-
"""Crawl Google Maps search results per area/keyword and store them in MySQL.

The script drives Chrome through selenium-wire, types a keyword into the
Google Maps search box at a given latitude/longitude, reads the captured
``search?`` responses, extracts place records from the embedded JSON and
inserts the ones not seen before into ``swire_store_list``.
"""
import gzip
import json
import os
import re
import socket
import sys
import time
import traceback
import urllib.parse
from datetime import datetime

import brotli
import dataset
import selenium
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# selenium-wire's webdriver is used in place of selenium's own because the
# crawler reads the captured network traffic through ``driver.requests``.
from seleniumwire import webdriver
from seleniumwire.utils import decode as sw_decode

# True: start a local headless Chrome. False: attach to a remote WebDriver
# (docker container) and route its traffic through the selenium-wire proxy.
chrome_window = True
# Keyword forced from the command line (argv[1]); None lets the DB pick one.
globalkw = None
# Port the selenium-wire proxy listens on when a remote browser is used.
proxyport = 8787
# Populated by main(); defined here so helpers never hit a NameError.
store_list_table = None
iddict = {}

# NOTE(review): credentials should not live in source control. The URL can be
# overridden with the GOOGLE_POI_DB_URL environment variable; the literal is
# kept only as a backward-compatible default.
DEFAULT_DB_URL = 'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4'

# Markers that delimit the JSON payload inside a Maps search response.
_PAYLOAD_START = re.compile(r'\[\["')
_PAYLOAD_END = re.compile(r'\]\]"')


def _first_row(rows):
    """Return the first row of an iterable query result, or None if empty."""
    for row in rows:
        return row
    return None


def _dig(obj, *path):
    """Follow ``path`` through nested containers; None if any step is missing."""
    for key in path:
        try:
            obj = obj[key]
        except (IndexError, KeyError, TypeError):
            return None
    return obj


def build_cache(db):
    """Return a dict keyed by every place_id already in swire_store_list."""
    rows = db.query('SELECT place_id FROM google_poi.swire_store_list;')
    return {row['place_id']: 1 for row in rows}


def brower_start(port):
    """Create and return the selenium-wire browser used for crawling.

    With ``chrome_window`` set, a local headless Chrome is started. Otherwise
    a remote WebDriver at ``127.0.0.1:<port>`` is used, with the browser
    pointed at the selenium-wire proxy listening on ``proxyport``.
    """
    print(proxyport)
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument("--no-sandbox")
    if chrome_window:
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Chrome(options=options)
    else:
        # The remote browser reaches the proxy through the docker host alias.
        options.add_argument('--proxy-server=host.docker.internal:' + str(proxyport))
        options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
            desired_capabilities=options.to_capabilities(),
            seleniumwire_options={'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False},
        )
    browser.set_window_size(1400, 1000)
    return browser


def page_down_(driver, xpath_css, time_):
    """Click just right of the last result card ``time_`` times to scroll.

    Nothing is done when the result counter reports five results or fewer.
    ``xpath_css`` is unused and kept only for interface compatibility.
    """
    counter = driver.find_element(By.CSS_SELECTOR, 'span[class="Jl2AFb"]')
    result_count = counter.text.split('-')[1].replace(' 項結果', '')
    print(result_count)
    if int(result_count) <= 5:
        return
    for _ in range(time_):
        cards = driver.find_elements(By.CSS_SELECTOR, 'div[class="TFQHme"]')
        last = cards[-1]
        # Use the imported ActionChains class directly rather than reaching
        # for it through the selenium-wire ``webdriver`` module.
        action = ActionChains(driver)
        action.move_to_element_with_offset(last, last.size['width'] + 1, 0)
        action.click()
        action.perform()
        time.sleep(0.5)


def keyin_keyword(driver, keyword):
    """Type ``keyword`` into the Maps search box and submit it."""
    print('key in keyword:' + keyword)
    # Set the implicit wait first so the lookup below also benefits from it.
    driver.implicitly_wait(30)
    button = driver.find_element(By.ID, "searchbox")
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)


def scan_job(db, kw):
    """Pick a random area whose latest conv_log entry grew, with its location.

    Returns a dict with ``kw``, ``num``, ``lat``, ``lon`` and ``loc``, or
    None when no candidate area or no location row exists.
    """
    picked = _first_row(db.query(
        'select t1.num,next-prev as diff from google_poi.conv_log t1, '
        '(SELECT num,max(id) mid  FROM google_poi.conv_log group by num  ) t2 '
        'where t1.id=t2.mid having diff>0 order by rand() limit 1'))
    if picked is None:
        return None
    # Bound parameter instead of string concatenation.
    place = _first_row(db.query(
        'select lat,lon,loc from lat_lon_loc where num = :num limit 1',
        num=str(picked['num'])))
    if place is None:
        return None
    return {
        'kw': kw,
        'num': picked['num'],
        'lat': place['lat'],
        'lon': place['lon'],
        'loc': place['loc'],
    }


def get_next_job(db, repeat=False, repkw=None, repnum=None):
    """Choose the next keyword/area to crawl.

    A random ``areacodes`` row supplies the area number (and the keyword when
    ``repkw`` is None); its coordinates come from ``lat_lon_loc``. With
    ``repeat`` set, a random existing store (keyword other than "火鍋餐廳")
    replaces keyword, area and coordinates if such a store exists.

    Returns a dict that may lack ``lat``/``lon`` when no location was found.
    """
    result = {}
    area = _first_row(db.query('select kw,num  from areacodes order by rand() limit 1'))
    if area is not None:
        if repkw is None:
            repkw = area['kw']
        result['num'] = area['num']
    if repkw is not None:
        result['kw'] = repkw

    if result.get('num') is not None:
        place = _first_row(db.query(
            'select lat,lon,loc from lat_lon_loc where num = :num limit 1',
            num=str(result['num'])))
        if place is not None:
            result['lat'] = place['lat']
            result['lon'] = place['lon']
            result['loc'] = place['loc']

    if repeat and repkw != 'REP':
        result['kw'] = repkw
        result['num'] = repnum

    if repeat:
        store = _first_row(db.query(
            'select  lat_txt,lon_txt,keyword,num from swire_store_list '
            'where keyword <> "火鍋餐廳" order by rand() limit 1'))
        if store is not None:
            result['kw'] = store['keyword']
            result['lat'] = store['lat_txt']
            result['lon'] = store['lon_txt']
            result['num'] = store['num']
            result['loc'] = ''
    return result


def write_to_file(jsobj, fname):
    """Write ``str(jsobj)`` to ``fname`` as UTF-8 (debugging aid)."""
    # newline='' keeps line endings untranslated, as the original binary-mode
    # codecs.open did.
    with open(fname, 'w', encoding='utf-8', newline='') as fw:
        fw.write(str(jsobj))


def parsing_js(orig):
    """Extract place records from the text of a Maps ``search?`` response.

    The response is un-escaped, the JSON between the first ``[["`` and the
    following ``]]"`` is decoded, and one dict per place is built from the
    entries after the first in ``payload[0][1]``. Optional fields that are
    absent become None; an entry missing a mandatory field is skipped and its
    traceback printed.

    Raises:
        ValueError: the payload markers are missing or the JSON is invalid.
    """
    # Two passes: the second one strips quotes that were escaped twice.
    content = ''.join(
        line.replace('\\"', '"').replace('\\"', '"') for line in orig.split('\n')
    )
    begin = _PAYLOAD_START.search(content)
    print(begin)
    if begin is None:
        raise ValueError('search payload start marker not found')
    # Look for the end marker only after the start marker.
    end = _PAYLOAD_END.search(content, begin.end())
    print(end)
    if end is None:
        raise ValueError('search payload end marker not found')
    # Drop the trailing quote that belongs to the end marker.
    jsobj = json.loads(content[begin.start():end.end() - 1])

    resultobj = []
    for x in jsobj[0][1][1:]:
        try:
            info = x[14]
            print(info[11])
            print(info[9])
            try:
                addr_elmts = str(info[82])
            except (IndexError, TypeError):
                addr_elmts = None
            category = str(info[13])
            topic = str(info[89])
            print(info[13])
            print(info[10])
            print(info[2])
            print(info[78])
            record = {
                'name': info[11],
                'fid': info[10],
                'addr': info[2][0],
                'addr_elmts': addr_elmts,
                'place_id': info[78],
                'category': category,
                # Optional fields are read independently so one missing value
                # neither hides its neighbour nor leaks from an earlier entry.
                'rating': _dig(info, 4, 7),
                'reviews_cnt': _dig(info, 4, 8),
                'lat': info[9][2],
                'lat_txt': str(info[9][2]),
                'lon': info[9][3],
                'lon_txt': str(info[9][3]),
                'topic': topic,
                'photo': _dig(info, 37, 0, 0, 0),
                'num_photos': _dig(info, 37, 0, 0, 6, 1),
                'loc_x': _dig(info, 37, 0, 0, 29, 0),
                'loc_y': _dig(info, 37, 0, 0, 29, 1),
                'biz_id': _dig(info, 57, 2),
                'tel': _dig(info, 178, 0, 3),
                'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M"),
            }
        except (IndexError, KeyError, TypeError):
            # A malformed entry must not discard the rest of the page.
            traceback.print_exc()
            continue
        resultobj.append(record)
    return resultobj


def save_js_to_db(jsobj, num, keyword):
    """Insert every record whose place_id is not yet in the ``iddict`` cache.

    Each inserted record is tagged with the area ``num`` and ``keyword`` and
    its place_id is added to the cache so it is not inserted again in this
    run. Insert errors are printed and do not stop the remaining records.
    """
    for record in jsobj:
        place_id = record['place_id']
        if iddict.get(place_id) is not None:
            continue
        record['num'] = num
        record['keyword'] = keyword
        try:
            store_list_table.insert(record)
        except Exception:
            traceback.print_exc()
        else:
            iddict[place_id] = 1


def process_web_request(db, driver, area_num, keyword):
    """Parse and store every captured ``search?`` response, then clear them.

    ``db`` is unused and kept for interface compatibility. Parsing errors
    propagate to the caller, but the captured requests are always cleared so
    they cannot be re-processed under a later area/keyword.
    """
    # Give the page time to issue its search requests.
    time.sleep(3.8)
    print("ppppppppp&**********************")
    try:
        for request in driver.requests:
            if 'search?' not in request.url:
                continue
            print('searching.....')
            response = request.response
            if not response:
                continue
            print('parsing js:')
            encoding = response.headers.get('Content-Encoding')
            print(encoding)
            # The header may be absent; treat that as "not compressed".
            encoding = (encoding or '').lower()
            body = response.body
            if 'gzip' in encoding:
                body = gzip.decompress(response.body)
            if 'br' in encoding:
                body = brotli.decompress(response.body)
            resultobj = parsing_js(body.decode('utf-8'))
            print("before", datetime.now())
            print("num: " + str(area_num))
            save_js_to_db(resultobj, area_num, keyword)
            print("after", datetime.now())
    finally:
        del driver.requests


def main():
    """Run the crawl loop until 15 jobs have failed.

    Command line: ``script [keyword [port [proxyport [extra]]]]``. When more
    than four argv entries are present, argv[1] selects the mode ('SCAN' in
    it runs scan_job, anything else repeats existing stores) and argv[2] is
    passed on as the second argument of that mode.
    """
    global store_list_table
    global globalkw
    global proxyport
    global iddict

    port = 4444
    # Each argument is optional; read only what was actually supplied.
    if len(sys.argv) > 1:
        globalkw = sys.argv[1]
    if len(sys.argv) > 2:
        port = int(sys.argv[2])
    if len(sys.argv) > 3:
        proxyport = int(sys.argv[3])
    print(globalkw, port, proxyport)

    db = dataset.connect(os.environ.get('GOOGLE_POI_DB_URL', DEFAULT_DB_URL))
    iddict = build_cache(db)
    store_list_table = db['swire_store_list']
    table2 = db['swire_area_progress']

    if not chrome_window:
        print('restart docker p{}'.format(port))
        os.system('docker container restart p' + str(port))
        time.sleep(10)
    print('drvier start...')
    driver = brower_start(port)

    failcnt = 0
    repeating = False
    try:
        while True:
            try:
                if len(sys.argv) > 4:
                    repkw = sys.argv[1]
                    repnum = sys.argv[2]
                    if 'SCAN' in repkw:
                        job = scan_job(db, repnum)
                    else:
                        repeating = True
                        job = get_next_job(db, repeat=True, repkw=repkw, repnum=repnum)
                else:
                    job = get_next_job(db, repkw=globalkw)
                print(job)
                if not job or 'lat' not in job or 'lon' not in job:
                    raise LookupError('no crawlable job found: {!r}'.format(job))

                keyword = job['kw']
                latitude = job['lat']    # latitude
                longitude = job['lon']   # longitude
                area_num = job['num']
                url = 'https://www.google.com.tw/maps/@{},{},18z?hl=zh-TW'.format(latitude, longitude)

                driver.get(url)
                time.sleep(3)
                keyin_keyword(driver, keyword)
                process_web_request(db, driver, area_num, keyword)

                # Follow the "next page" button for at most five more pages
                # (one page only when repeating existing stores).
                pagecnt = 0
                while True:
                    element = driver.find_element(By.ID, 'ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                    if element.get_attribute('disabled'):
                        break
                    ActionChains(driver).move_to_element(element).click(element).perform()
                    process_web_request(db, driver, area_num, keyword)
                    if repeating:
                        break
                    pagecnt += 1
                    if pagecnt >= 5:
                        break

                # NOTE(review): insert()'s second positional argument is
                # `ensure` in dataset; ['kw'] looks like a leftover from an
                # earlier upsert call - confirm before changing.
                table2.insert({'kw': keyword, 'num': job['num']}, ['kw'])
                db.query(
                    'update areacodes set expand = 1 where num = :num and kw = :kw',
                    num=str(job['num']), kw=keyword)
            except Exception:
                # Exception (not a bare except) so Ctrl-C still stops the run.
                traceback.print_exc()
                failcnt += 1
                if failcnt >= 15:
                    sys.exit()
    finally:
        # Always release the browser, including on sys.exit() and Ctrl-C.
        driver.quit()


if __name__ == '__main__':
    main()