Wunoodles há 3 anos atrás
pai
commit
8ec8250724
3 ficheiros alterados com 158 adições e 95 exclusões
  1. 150 88
      run.py
  2. 6 5
      utility/googlemapsearch.sql
  3. 2 2
      utility/parseutils.py

+ 150 - 88
run.py

@@ -75,9 +75,9 @@ def open_time(driver):
 
 
 def get_shop_info(driver, output, shop_soup):
-    current_url_split = driver.current_url.split('@')[1].split(',')
-    output['lon'] = current_url_split[1]
-    output['lat'] = current_url_split[0]
+    # current_url_split = driver.current_url.split('@')[1].split(',')
+    # output['lon'] = current_url_split[1]
+    # output['lat'] = current_url_split[0]
     
     location = shop_soup.find('button',{'data-item-id':'oloc'})['aria-label'].split(' ')
     output['city'] = location[-1]
@@ -420,16 +420,30 @@ def time_click(driver):
         return ''
 
 
def get_new_keyword(db):
    """Pick the next keyword that still needs crawling.

    Fetches every distinct keyword from ``shop_item_list`` (alphabetical
    order comes from the SQL ``order by``) and returns the first one that
    has no entry yet in the ``progress_list2`` progress table.

    Args:
        db: an open ``dataset`` connection — any object whose ``query()``
            yields mapping rows works.

    Returns:
        The next unprocessed keyword string.

    Raises:
        ValueError: when every keyword has already been processed.  The
            original code died here with an opaque ``IndexError`` from
            ``iloc[0]`` on an empty frame.
    """
    result = pd.DataFrame(list(db.query(
        'select distinct(keyword) from shop_item_list order by keyword')))
    progress = pd.DataFrame(list(db.query(
        'select distinct(kw) from progress_list2')))

    if len(progress) != 0:
        # Keep only keywords not yet recorded as done.
        remaining = result[~result['keyword'].isin(progress.kw.to_list())]
    else:
        remaining = result

    if remaining.empty:
        raise ValueError('no unprocessed keyword left in shop_item_list')

    return remaining.iloc[0].values[0]
+
+
 def get_not_cralwer_url(keyword):
     db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
-    table = db['shop_item_list']
+    table = db['shop_item_list3']
     url_list = list(table.find(keyword=keyword))
-    shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list where keyword="{}"'.format(keyword))]
-    error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list where keyword="{}"'.format(keyword))]
+    shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list2 where keyword="{}"'.format(keyword))]
+    error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list2 where keyword="{}"'.format(keyword))]
     
     url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
-    url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
-    url_pd = url_pd[(url_pd['item_url_length']!=1000) & (url_pd['item_url_length']!=600)]
+    # url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
+    # url_pd = url_pd[(url_pd['item_url_length']!=1000) & (url_pd['item_url_length']!=600)]
     url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
     url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
 
@@ -464,6 +478,44 @@ def serive_create_linux(profilepath):
     return driver
 
 
def find_lon_lat(driver=None):
    """Read the latitude/longitude of the map position currently shown.

    Right-clicks the centre of the visible map canvas (offset past the
    left-hand details pane) and parses the "lat, lon" text from the first
    entry of Google Maps' context menu.

    Args:
        driver: selenium WebDriver to use.  BUG(review): the original took
            no argument and referenced a global ``driver`` that is never
            defined at module level (``main()`` keeps its driver local), so
            calling this without an argument always failed.  The parameter
            defaults to None to stay call-compatible; pass the driver
            explicitly.

    Returns:
        ``(lat, lon)`` as floats.
    """
    if driver is None:
        # Preserve the original global lookup as a last-resort fallback.
        driver = globals()['driver']

    canvas = driver.find_element_by_css_selector(
        "#scene > div.widget-scene > canvas")
    total_height = canvas.size['height']
    total_width = canvas.size['width']

    # The details pane overlaps the canvas on the left; aim for the centre
    # of the remaining visible map area.
    pane = driver.find_element_by_css_selector("#pane > div.Yr7JMd-pane")
    left_width = pane.size['width']
    x = (total_width - left_width) / 2 + left_width
    y = total_height / 2

    action = webdriver.common.action_chains.ActionChains(driver)
    action.move_to_element_with_offset(canvas, x, y)
    action.context_click()
    action.perform()

    time.sleep(0.5)  # give the context menu time to render

    # First context-menu item holds the "lat, lon" coordinates text.
    element = driver.find_element_by_css_selector(
        '#action-menu > ul > li:nth-child(1)')
    lat, lon = element.text.split(',')
    return float(lat), float(lon)
+
+
def get_unique_id(driver=None):
    """Return the unique id embedded in a place's share short-URL.

    Opens the share dialog (button labelled "分享"), reads the short URL
    from the dialog's input field, closes the dialog (button labelled
    "關閉"), and returns the URL's last path segment.

    Args:
        driver: selenium WebDriver to use.  BUG(review): the original read
            a global ``driver`` that is never defined at module level; the
            optional parameter keeps the old call signature working while
            letting callers pass the driver explicitly.

    Returns:
        The short-URL id string (stored as the row's ``unique_id``).
    """
    if driver is None:
        driver = globals()['driver']  # original (broken) global fallback

    share_btn = driver.find_element(By.CSS_SELECTOR, "button[data-value='分享']")
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(share_btn).click(share_btn).perform()

    url_input = driver.find_element(By.CSS_SELECTOR, "input")
    short_url = url_input.get_attribute('value')
    unique_id = short_url.split('/')[-1]

    close_btn = driver.find_element(By.CSS_SELECTOR, "button[aria-label='關閉']")
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(close_btn).click(close_btn).perform()
    return unique_id
+
+
 def page_down_(driver, xpath_css, time_):
     elmts = driver.find_elements_by_xpath(xpath_css)
     print(elmts)
@@ -484,89 +536,99 @@ def page_down_(driver, xpath_css, time_):
 
 def main():
     db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
-    
-    keyword = '麻辣火鍋'
-    if len(sys.argv) >1:
-        keyword=sys.argv[1]
-    port=4444
-    if len(sys.argv) >2:
-        port=int(sys.argv[2])
-
-    url_pd = get_not_cralwer_url(keyword)
-    print('drvier start {}...'.format(keyword))
-    driver = brower_start(port)
-    #driver = serive_create('Profile 6')
-    #profilepath = 'Profile 1'
-    #driver = serive_create_linux(profilepath)
-    
-    for key, row in url_pd.iterrows():
-        try:    
-            name = row['name']
-            item_url = row['item_url']
-            print(key, name, ': ' ,item_url)
-
-            print('start...')
-            driver.get(item_url)
-            page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
-            
-            time_status = time_click(driver)
-            if time_status == 'error':
-                error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
-                data_select_insert(db, 'error_list', error_table_col, row)
-                continue
-            time.sleep(0.5)
-            shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
-
-            output = {
-                'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
-            }
-            print(output['name'])
-
-            print('get_shop_info')
-            output = get_shop_info(driver, output, shop_soup)
-
-            print('get_intro_info')
-            if len(shop_soup.select("div[aria-label='{}簡介']".format(output['name']))) != 0:
-                output = get_intro_info(driver, output)
-            else:
-                for key in intro_list:
-                    output[intro_list[key][0]] = '[]'
+    db2 = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    table2 = db2['progress_list2']
+    # keyword = '麻辣火鍋'
+    # if len(sys.argv) >1:
+    #     keyword=sys.argv[1]
+    # port=4444
+    # if len(sys.argv) >2:
+    #     port=int(sys.argv[2])
+    if len(sys.argv) > 1 :
+        port=int(sys.argv[1])
+        print('restart docker p{}'.format(port))
+        os.system('sudo docker container restart pp'+str(port))
+        time.sleep(8)
+
+    for i in range(5):
+        keyword = get_new_keyword(db2)
+        url_pd = get_not_cralwer_url(keyword)
+        print('drvier start {}...'.format(keyword))
+        driver = brower_start(port)
+        #driver = serive_create('Profile 6')
+        #profilepath = 'Profile 1'
+        #driver = serive_create_linux(profilepath)
+        
+        for key, row in url_pd.iterrows():
+            try:    
+                name = row['name']
+                item_url = row['item_url']
+                print(key, name, ': ' ,item_url)
+
+                print('start...')
+                driver.get(item_url)
+                page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
+                lat, lon = find_lon_lat()
+                unique_id = get_unique_id()
+                time_status = time_click(driver)
+                if time_status == 'error':
+                    error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
+                    data_select_insert(db, 'error_list2', error_table_col, row)
+                    continue
+                time.sleep(0.5)
+                shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
+
+                output = {
+                    'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text),
+                    'lon': lon,
+                    'lat': lat,
+                    'unique_id': unique_id
+                }
+                print(output['name'], lon, lat)
+
+                print('get_shop_info')
+                output = get_shop_info(driver, output, shop_soup)
+
+                print('get_intro_info')
+                if len(shop_soup.select("div[aria-label='{}簡介']".format(output['name']))) != 0:
+                    output = get_intro_info(driver, output)
+                else:
+                    for key in intro_list:
+                        output[intro_list[key][0]] = '[]'
 
-            print('get_time_list')
-            if time_status == '正常':
-                output = get_time_list(shop_soup, output)
-            else:
-                output['open_now'] = False
-                output['periods'] = ''
-                output['weekday_text'] = ''
+                print('get_time_list')
+                if time_status == '正常':
+                    output = get_time_list(shop_soup, output)
+                else:
+                    output['open_now'] = False
+                    output['periods'] = ''
+                    output['weekday_text'] = ''
 
-            print('user_ratings_total')
-            if output['user_ratings_total'] == '':
-                output['reviews'] = ''
-            else:
-                output = get_reviews(driver, output)
-
-            print('find_big_photo')
-            output = find_big_photo(output, driver)
-
-            output_name = output['name'].replace('(','').replace(')', '')
-            query_name = '{}+{}'.format(output_name, output['addr'])
-            query_name = query_name.replace(' ','')
-            output['item_url'] = item_url
-            output['keyword'] = keyword
-            output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
-            data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
-            
-        except Exception as e:
-            print(e)
-            error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
-            data_select_insert(db, 'error_list', error_table_col, row)
-            time.sleep(1)
-            # driver.close()
-            # driver = brower_start(port)
-            # driver = serive_create_linux(profilepath)
-   
+                print('user_ratings_total')
+                if output['user_ratings_total'] == '':
+                    output['reviews'] = ''
+                else:
+                    output = get_reviews(driver, output)
+
+                print('find_big_photo')
+                output = find_big_photo(output, driver)
+
+                output_name = output['name'].replace('(','').replace(')', '')
+                query_name = '{}+{}'.format(output_name, output['addr'])
+                query_name = query_name.replace(' ','')
+                output['item_url'] = item_url
+                output['keyword'] = keyword
+                output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
+                data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
+                
+            except Exception as e:
+                print(e)
+                error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
+                data_select_insert(db, 'error_list2', error_table_col, row)
+                time.sleep(1)
 
+        table2.insert({'kw':keyword,'num':key})
+ 
 
# Script entry point: start the crawler when executed directly.
if __name__ == '__main__':
    main()

+ 6 - 5
utility/googlemapsearch.sql

@@ -1,5 +1,6 @@
-CREATE TABLE `shop_list` (
+CREATE TABLE `shop_list2` (
    `id` int NOT NULL AUTO_INCREMENT,
+   `unique_id` VARCHAR(20),
    `name` VARCHAR(100),
    `lon` DOUBLE,
    `lat` DOUBLE,
@@ -33,12 +34,12 @@ CREATE TABLE `shop_list` (
    `shop_photo` JSON,
    `menu_photo` JSON,
 
-   `google_url` VARCHAR(200),
-   `item_url` VARCHAR(200),
+   `google_url` text,
+   `item_url` text,
    `keyword` VARCHAR(20),
    `crawler_date` char(20) NOT NULL,
    PRIMARY KEY (`id`),
-   UNIQUE KEY (`item_url`)
+   UNIQUE KEY (`unique_id`)
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
 
 
@@ -55,7 +56,7 @@ CREATE TABLE `shop_item_list` (
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
 
 
-CREATE TABLE `error_list` (
+CREATE TABLE `error_list2` (
    `id` int NOT NULL AUTO_INCREMENT,
    `name` VARCHAR(100),
    `lon` DOUBLE,

+ 2 - 2
utility/parseutils.py

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 DB_NAME = 'google_poi'
-SHOP_LIST_TABLE = 'shop_list'
-SHOP_LIST_TABLE_COL = ['name', 'lon', 'lat', 'city', 'area', 
+SHOP_LIST_TABLE = 'shop_list2'
+SHOP_LIST_TABLE_COL = ['unique_id', 'name', 'lon', 'lat', 'city', 'area', 
                       'rating', 'user_ratings_total', 'category', 'price_level', 
                       'addr', 'tel', 'services', 'products', 'choices', 
                       'facilities', 'groups', 'plans', 'payments', 'safeties',