noodles 3 years ago
parent
commit
93864cb2fb
1 changed file with 133 additions and 78 deletions
gnews_keyword.py  +133 -78

@@ -1,9 +1,11 @@
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.support.wait import WebDriverWait
 import time, pickle, sys, os, re, requests
 import dataset
+import traceback
 import pandas as pd
 from datetime import datetime, timedelta
 from newspaper import Article
@@ -60,79 +62,80 @@ def conv_time(t):
     return s
 
 
-def get_trends(q, url_table, id_cache, driver):
-    
-    driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
-    time.sleep(3)
-    # print(driver.page_source)
-    # click tool
-    # element = driver.find_element(By.ID, "hdtb-tls")
-    # driver.implicitly_wait(5)
-    # ActionChains(driver).move_to_element(element).click(element).perform()
-
-    # click time
-    # element = driver.find_elements(By.XPATH, "//div[@class='KTBKoe']")
-    # driver.implicitly_wait(5)
-    # ActionChains(driver).move_to_element(element[1]).click(element[1]).perform()
-
-    # click time
-    # element = driver.find_element(By.XPATH,"//div[@id='lb']")
-    # ele = element.find_elements(By.XPATH,"//g-menu-item[@class='ErsxPb']")
-    # for e in ele:
-    #     if e.text == '過去 24 小時':
-    #         print(e.text)
-    #         driver.implicitly_wait(5)
-    #         ActionChains(driver).move_to_element(e).click(e).perform()
-    #         break
-
-    c = 0
-    while True:
-        time.sleep(3)
-        c += 1
-        logger_.info('page {}'.format(c))
-        print(driver.page_source)
-        elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
-        print(elmts)
-        for elmt in elmts:
-            title, url, company = '', '', ''
-            e = elmt.find_element_by_xpath(".//div[@role='heading']")
-            title = e.text
-            # print(title)
-
-            url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
-            # print(url)
-
-            company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
-            # print(company)
-
-            day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
-            day = conv_time(day)
-            # print(day)
-
-            print(title, url, company, day)
-            
-            if url not in id_cache:
-                url_table.insert({
-                    'title': title,
-                    'url': url,
-                    'keyword': q,
-                    'company': company,
-                    'day': str(day),
-                    'crawler_date': current,
-                    'page': c,
-                    '_status': 0
-                })
+def page_down_(driver, time_):
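+    # Press PAGE_DOWN `time_` times, pausing 0.5 s between presses so
+    # lazily loaded results have a chance to render.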
 
-            if c > 3: break
+    for i in range(time_):
         try:
-            element = driver.find_element_by_xpath("//a[@id='pnnext']")
-            driver.implicitly_wait(5)
-            ActionChains(driver).move_to_element(element).click(element).perform()
+            actions = ActionChains(driver)
+            actions.send_keys(Keys.PAGE_DOWN).perform()
         except:
-            print('done')
-            break
-    logger_.info('{} news list update'.format(q))
-    return driver 
+            traceback.print_exc()
+        time.sleep(0.5)
+
+
+# def get_trends(q, url_table, id_cache, driver):
+    
+#     driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
+#     time.sleep(3)
+#     driver.refresh()
+#     c = 0
+#     while True:
+#         time.sleep(3)
+#         c += 1
+#         logger_.info('page {}'.format(c))
+#         print(driver.current_url)
+#         # print(driver.page_source)
+#         elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
+#         print(elmts)
+#         for elmt in elmts:
+#             title, url, company = '', '', ''
+#             e = elmt.find_element_by_xpath(".//div[@role='heading']")
+#             title = e.text
+#             # print(title)
+
+#             url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+#             # print(url)
+
+#             company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+#             # print(company)
+
+#             day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
+#             day = conv_time(day)
+#             # print(day)
+
+#             print(title, url, company, day)
+            
+#             if url not in id_cache:
+#                 url_table.insert({
+#                     'title': title,
+#                     'url': url,
+#                     'keyword': q,
+#                     'company': company,
+#                     'day': str(day),
+#                     'crawler_date': current,
+#                     'page': c,
+#                     '_status': 0
+#                 })
+
+#         if c > 3: break
+
+#         next_url = driver.current_url
+#         next_url = next_url.replace('start={}'.format(c-1)*10,'start={}'.format(c)*10)
+#         driver.get(next_url)
+#         print(next_url)
+#         # try:
+#         # page_down_(driver, 3)
+#         # next_url = driver.find_element_by_xpath("//a[@id='pnnext']").get_attribute('href')
+#         # driver.get(next_url)
+#         # print(next_url)
+#         # driver.implicitly_wait(5)
+#             # ActionChains(driver).move_to_element(element).click(element).perform()
+#         # except:
+#         #     print('done')
+#         #     break
+#     logger_.info('{} news list update'.format(q))
+#     return driver 
+
 
 def our_rule(url, company, driver):
     url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下', '經濟日報',
@@ -252,6 +255,54 @@ def get_next_job(db, table, query_key):
     return url_pd
 
 
+def get_trends(q, url_table, id_cache, driver, page_url, page):
+
+    driver.get(page_url)
+    time.sleep(3)
+
+    print(driver.current_url)
+    # print(driver.page_source)
+    elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
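+    # Each matching <div> is one news result card; the class names are
+    # Google-generated and may change without notice.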
+    print(elmts)
+    for elmt in elmts:
+        title, url, company = '', '', ''
+        e = elmt.find_element_by_xpath(".//div[@role='heading']")
+        title = e.text
+        # print(title)
+
+        url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+        # print(url)
+
+        company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+        # print(company)
+
+        day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
+        day = conv_time(day)
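+        # Normalise the displayed date string with conv_time() (defined above).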
+        # print(day)
+
+        print(title, url, company, day)
+        
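+        # Only queue URLs not already in the cache; '_status': 0 marks the
+        # row as not yet crawled.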
+        if url not in id_cache:
+            url_table.insert({
+                'title': title,
+                'url': url,
+                'keyword': q,
+                'company': company,
+                'day': str(day),
+                'crawler_date': current,
+                'page': page,
+                '_status': 0
+            })
+
+    # next_url = driver.current_url
+    # next_url = next_url.replace('start={}'.format(c-1)*10,'start={}'.format(c)*10)
+    # driver.get(next_url)
+    # print(next_url)
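+    # "pnnext" is the id of Google's "Next" pagination link.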
+    try:
+        next_url = driver.find_element_by_xpath("//a[@id='pnnext']").get_attribute('href')
+    except:
+        traceback.print_exc()
+        next_url = None
+
+    logger_.info('{} news list update'.format(q))
+    return driver, next_url
+
 def main():
 
     if len(sys.argv) > 1 :
@@ -275,14 +326,18 @@ def main():
 
     # find new news url
     id_cache = build_cache(db, url_table_name)
-    driver = get_trends(query_key, url_table, id_cache, driver)
-    time.sleep(5)
-
-    url_pd = get_next_job(db, url_table_name, query_key)
-    logger_.info('find {} news...'.format(len(url_pd)))
-    
-    driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
-    logger_.info('{} news description update'.format(query_key))
+    url = "https://www.google.com/search?q={}&tbm=nws".format(query_key)
+    # url = "https://www.google.com"
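+    # Walk up to three result pages, harvesting the result list and then the
+    # article details for each page before following the "Next" link.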
+    for i in range(3):
+        logger_.info('page {}'.format(i+1))
+        driver, url = get_trends(query_key, url_table, id_cache, driver, url, i+1)
+        time.sleep(5)
+
+        url_pd = get_next_job(db, url_table_name, query_key)
+        logger_.info('find {} news...'.format(len(url_pd)))
+
+        driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
+        logger_.info('{} news description update'.format(query_key))
+
+        if url is None:
+            # No "Next" link on this page: stop paginating.
+            break
 
     db.close()
     driver.close()