# -*- coding: utf-8 -*-
#from selenium import webdriver
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
import selenium
import traceback
from bs4 import BeautifulSoup

from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *

from datetime import datetime
import pandas as pd
import dataset
import time
import json
import re
import sys, os
import socket
import brotli

# When True, drive a local Chrome window instead of a remote Selenium node.
chrome_window = False


def brower_start(port):
    """Start a Chrome session, locally or on the remote Selenium hub at the given port."""
    options = webdriver.ChromeOptions()
    if chrome_window:
        browser = webdriver.Chrome(
            desired_capabilities=options.to_capabilities()
        )
    else:
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
            desired_capabilities=options.to_capabilities()
        )
    return browser


def page_down_(driver, xpath_css, time_):
    """Scroll the result panel by clicking just past the last separator element."""
    e = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
    # The counter text looks like "1-20 項結果" ("1-20 results"); keep only the upper bound.
    result_count = e.text.split('-')[1].replace(' 項結果', '')
    print(result_count)
    if int(result_count) > 5:
        for i in range(time_):
            e = driver.find_elements_by_css_selector('div[class="TFQHme"]')
            action = webdriver.common.action_chains.ActionChains(driver)
            action.move_to_element_with_offset(e[-1], e[-1].size['width'] + 1, 0)
            action.click()
            action.perform()
            time.sleep(0.5)

    # elmts = driver.find_elements_by_xpath(xpath_css)
    # print(elmts)
    # if len(elmts) > 1:
    #     elmt = elmts[1]
    # else:
    #     elmt = elmts[0]

    # actions = ActionChains(driver)
    # actions.move_to_element(elmt).click().perform()
    # for i in range(time_):
    #     try:
    #         actions = ActionChains(driver)
    #         actions.send_keys(Keys.PAGE_DOWN).perform()
    #     except:
    #         traceback.print_exc()
    #     time.sleep(0.5)


def get_url_list(driver):
    """Collect (url, name) pairs for every place link currently shown in the result panel."""
    # for i in range(5, 43, 2):
    #     try:
    #         wait = WebDriverWait(driver, 60)
    #         wait.until(
    #             EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)))
    #         )
    #         driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
    #         time.sleep(0.5)
    #     except:
    #         pass

    # wait = WebDriverWait(driver, 30)
    # try:  # "update results when the map moves"
    #     wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ppdPk-Ej1Yeb-LgbsSe-tJiF1e"]')))
    # except selenium.common.exceptions.TimeoutException:
    #     traceback.print_exc()
    #     return "EMPTY"

    page_down_(driver, '//div[@class="TFQHme"]', 8)

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except:
            pass
    # print(len(url_list))
    return url_list


def keyin_keyword(driver, keyword):
    """Type the keyword into the Google Maps search box and submit it."""
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)


# def get_crawler_list(db):
#     result = db.query('select keyword, count(*) from shop_item_list group by keyword')
#     result = pd.DataFrame([i for i in result])
#     result.columns = ['keyword', 'count']
#     result = result[result['count'] < 100]
#     keyword = result.sample(1).iloc[0]['keyword']
#
#     num = 0
#     cursor = db.query('select num from progress_list2 where kw = "' + keyword + '"')
#     for c in cursor:
#         num = c['num']
#         break
#
#     cursor = db.query('select * from lat_lon_loc where num >= ' + str(num))
#     # cursor = db.query('select * from lat_lon_loc')
#     lst = []
#     for c in cursor:
#         lst.append({'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']})
#
#     return keyword, lst


def get_crawler_list(db):
    """Return the keyword to crawl next (currently hard-coded)."""
#    result = db.query('select * from shop_item_list order by keyword')
#    result = pd.DataFrame([i for i in result])
#    result = result[~result.keyword.str.contains('項')]
#    progress = db.query('select distinct(kw) from progress_list2 where num < 367')
#    progress = pd.DataFrame([i for i in progress])
#    if len(progress) != 0:
#        keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0]['keyword']
#    else:
#        keyword = result.iloc[0]['keyword']
#
#    return keyword
    return '滷味'
    # NOTE: the database-driven lookup below is unreachable while the keyword above is hard-coded.
    cursor = db.query('select distinct(kw) from progress_list2 where num < 367 order by num asc limit 1')
    for c in cursor:
        return c['kw']
    return None


def get_lon_lat_list(db, keyword):
    """Return the grid cells (num/loc/lat/lon) not yet crawled for this keyword."""
    num = 0
    cursor = db.query('select num from progress_list2 where kw = "' + keyword + '"')
    for c in cursor:
        num = c['num']
        break

    cursor = db.query('select * from lat_lon_loc where num >= ' + str(num))
    lst = []
    for c in cursor:
        lst.append({'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']})
    return lst


def parsing_js(orig):
    """Extract the JSON array embedded in the Google Maps search response and print a few fields."""
    content = ""
    lines = orig.split('\n')
    for l in lines:
        newl = l.replace('\\"', '"')
        content += newl

    result = re.search(r'\[\["', content)
    content_begin = result.start()
    result = re.search(r'\]\]"', content)
    content_end = result.end()

    jscontent = content[content_begin:content_end - 1]
    jsobj = json.loads(jscontent)
    print()
    for x in jsobj[0][1][1:]:
        print(x[14][11])
        print(x[14][10])
        print(x[14][2])
        print(x[14][78])


def main():
    global chrome_window
    localip = socket.gethostbyname(socket.gethostname())
    if localip == '192.168.1.108':
        chrome_window = True

    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list3']
    table2 = db['progress_list2']

    port = 4447
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        print('restart docker p{}'.format(port))
        os.system('sudo docker container restart p' + str(port))
        time.sleep(8)

    print('driver start...')
    driver = brower_start(port)

    for i in range(10):
        try:
            keyword = get_crawler_list(db)
            print(keyword)
            lst = get_lon_lat_list(db, keyword)
#            print(lst)
            print(keyword, len(lst))

            for r in lst:
                latitude = r['lat']    # latitude
                longitude = r['lon']   # longitude
                area_num = r['num']
                table2.upsert({'kw': keyword, 'num': r['num']}, ['kw'])

                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
                driver.get(url)
                keyin_keyword(driver, keyword)
                failcnt = 0
                time.sleep(10)

                # Inspect the requests captured by selenium-wire and decode the map search payload.
                for request in driver.requests:
                    if request.response:
                        if 'https://www.google.com.tw/search?tbm=map' in request.url:
                            print(
                                request.url,
                                request.response.status_code,
                                request.response.headers['Content-Type']
                            )
                            print('parsing js:')
                            resp = brotli.decompress(request.response.body)
                            jstext = resp.decode('utf-8')
                            parsing_js(jstext)
#                            import codecs
#                            fw = codecs.open('c:/tmp/ot.json', 'w', 'utf-8')
#                            fw.write(jstext)
#                            fw.close()
#                            print(jstext)
#                            time.sleep(9999)
#                            jsobj = json.loads(jstext)
#                            print(jsobj)
#                sys.exit()

                for page in range(10):
                    print(keyword, latitude, longitude, page)
                    url_list = get_url_list(driver)
                    duplicate = 0
                    # shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
                    for item in url_list:
                        try:
                            table.insert({'name': item[1], 'lon': longitude, 'lat': latitude,
                                          'keyword': keyword, 'item_url': item[0], 'area_num': area_num,
                                          'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
                        except:
                            duplicate += 1
                    print(len(url_list), duplicate)

                    # result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
                    # insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
                    #                 .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'', ''), tuple(result))
                    # DA.mysql_insert_data(db, insert_sql)

                    # Only advance to the next result page for the first two iterations.
                    if page < 2:
                        element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                        if element.get_attribute('disabled'):
                            break
                        driver.implicitly_wait(30)
                        ActionChains(driver).move_to_element(element).click(element).perform()
        except:
            pass


if __name__ == '__main__':
    main()
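
# ---------------------------------------------------------------------------
# Usage sketch (not part of the crawler itself). It assumes the remote node is
# a Selenium standalone Chrome container named "p<port>" exposing /wd/hub on
# the same port; this script only restarts that container, it never creates it.
# The file name below is a placeholder:
#
#     python <this-script>.py 4448   # restarts container "p4448", then crawls
#     python <this-script>.py        # no restart; connects to port 4447
#
# The brotli.decompress() call in main() assumes Google always answers the
# tbm=map request with "Content-Encoding: br". A more defensive variant (a
# sketch only; decode_body is a hypothetical helper, stdlib + brotli) would
# branch on that header first:
#
#     def decode_body(response):
#         enc = response.headers.get('Content-Encoding', '')
#         if enc == 'br':
#             return brotli.decompress(response.body)
#         if enc == 'gzip':
#             import gzip
#             return gzip.decompress(response.body)
#         return response.body
# ---------------------------------------------------------------------------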