noodles, 3 years ago
Commit 93864cb2fb
1 file changed, 133 insertions and 78 deletions

gnews_keyword.py  +133 -78

@@ -1,9 +1,11 @@
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.support.wait import WebDriverWait
 import time, pickle, sys, os, re, time, requests
 import dataset
+import traceback
 import pandas as pd
 from datetime import datetime, timedelta
 from newspaper import Article
@@ -60,79 +62,80 @@ def conv_time(t):
     return s


-def get_trends(q, url_table, id_cache, driver):
-    
-    driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
-    time.sleep(3)
-    # print(driver.page_source)
-    # click tool
-    # element = driver.find_element(By.ID, "hdtb-tls")
-    # driver.implicitly_wait(5)
-    # ActionChains(driver).move_to_element(element).click(element).perform()
-
-    # click time
-    # element = driver.find_elements(By.XPATH, "//div[@class='KTBKoe']")
-    # driver.implicitly_wait(5)
-    # ActionChains(driver).move_to_element(element[1]).click(element[1]).perform()
-
-    # click time
-    # element = driver.find_element(By.XPATH,"//div[@id='lb']")
-    # ele = element.find_elements(By.XPATH,"//g-menu-item[@class='ErsxPb']")
-    # for e in ele:
-    #     if e.text == '過去 24 小時':
-    #         print(e.text)
-    #         driver.implicitly_wait(5)
-    #         ActionChains(driver).move_to_element(e).click(e).perform()
-    #         break
-
-    c = 0
-    while True:
-        time.sleep(3)
-        c += 1
-        logger_.info('page {}'.format(c))
-        print(driver.page_source)
-        elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
-        print(elmts)
-        for elmt in elmts:
-            title, url, company = '', '', ''
-            e = elmt.find_element_by_xpath(".//div[@role='heading']")
-            title = e.text
-            # print(title)
-
-            url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
-            # print(url)
-
-            company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
-            # print(company)
-
-            day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
-            day = conv_time(day)
-            # print(day)
-
-            print(title, url, company, day)
-            
-            if url not in id_cache:
-                url_table.insert({
-                    'title': title,
-                    'url': url,
-                    'keyword': q,
-                    'company': company,
-                    'day': str(day),
-                    'crawler_date': current,
-                    'page': c,
-                    '_status': 0
-                })
+def page_down_(driver, time_):

-            if c > 3: break
+    for i in range(time_):
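+        # press PAGE_DOWN once per iteration to scroll the page, pausing 0.5 s between presses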
         try:
-            element = driver.find_element_by_xpath("//a[@id='pnnext']")
-            driver.implicitly_wait(5)
-            ActionChains(driver).move_to_element(element).click(element).perform()
+            actions = ActionChains(driver)
+            actions.send_keys(Keys.PAGE_DOWN).perform()
         except:
-            print('done')
-            break
-    logger_.info('{} news list update'.format(q))
-    return driver 
+            traceback.print_exc()
+        time.sleep(0.5)
+
+
+# def get_trends(q, url_table, id_cache, driver):
+    
+#     driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
+#     time.sleep(3)
+#     driver.refresh()
+#     c = 0
+#     while True:
+#         time.sleep(3)
+#         c += 1
+#         logger_.info('page {}'.format(c))
+#         print(driver.current_url)
+#         # print(driver.page_source)
+#         elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
+#         print(elmts)
+#         for elmt in elmts:
+#             title, url, company = '', '', ''
+#             e = elmt.find_element_by_xpath(".//div[@role='heading']")
+#             title = e.text
+#             # print(title)
+
+#             url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+#             # print(url)
+
+#             company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+#             # print(company)
+
+#             day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
+#             day = conv_time(day)
+#             # print(day)
+
+#             print(title, url, company, day)
+            
+#             if url not in id_cache:
+#                 url_table.insert({
+#                     'title': title,
+#                     'url': url,
+#                     'keyword': q,
+#                     'company': company,
+#                     'day': str(day),
+#                     'crawler_date': current,
+#                     'page': c,
+#                     '_status': 0
+#                 })
+
+#         if c > 3: break
+
+#         next_url = driver.current_url
+#         next_url = next_url.replace('start={}'.format(c-1)*10,'start={}'.format(c)*10)
+#         driver.get(next_url)
+#         print(next_url)
+#         # try:
+#         # page_down_(driver, 3)
+#         # next_url = driver.find_element_by_xpath("//a[@id='pnnext']").get_attribute('href')
+#         # driver.get(next_url)
+#         # print(next_url)
+#         # driver.implicitly_wait(5)
+#             # ActionChains(driver).move_to_element(element).click(element).perform()
+#         # except:
+#         #     print('done')
+#         #     break
+#     logger_.info('{} news list update'.format(q))
+#     return driver 
+

 def our_rule(url, company, driver):
     url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下', '經濟日報',
@@ -252,6 +255,54 @@ def get_next_job(db, table, query_key):
     return url_pd


+def get_trends(q, url_table, id_cache, driver, url, page):
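+    # scrape a single Google News results page for keyword q, queue unseen
+    # article URLs, and return the driver together with the next-page URL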
+    
+    driver.get(url)
+    time.sleep(3)
+
+    print(driver.current_url)
+    # print(driver.page_source)
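+    # each matching div is one news result card on the page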
+    elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
+    print(elmts)
+    for elmt in elmts:
+        title, url, company = '', '', ''
+        e = elmt.find_element_by_xpath(".//div[@role='heading']")
+        title = e.text
+        # print(title)
+
+        url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+        # print(url)
+
+        company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+        # print(company)
+
+        day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
+        day = conv_time(day)
+        # print(day)
+
+        print(title, url, company, day)
+        
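+        # only queue URLs that have not been crawled before (id_cache is built from the url table)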
+        if url not in id_cache:
+            url_table.insert({
+                'title': title,
+                'url': url,
+                'keyword': q,
+                'company': company,
+                'day': str(day),
+                'crawler_date': current,
+                'page': page,
+                '_status': 0
+            })
+
+    # next_url = driver.current_url
+    # next_url = next_url.replace('start={}'.format(c-1)*10,'start={}'.format(c)*10)
+    # driver.get(next_url)
+    # print(next_url)
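+    # follow Google's "Next" pagination link (id='pnnext'); this assumes the
+    # link exists and will raise NoSuchElementException on the last page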
+    next_url = driver.find_element_by_xpath("//a[@id='pnnext']").get_attribute('href')
+
+    logger_.info('{} news list update'.format(q))
+    return driver, next_url
+
 def main():

     if len(sys.argv) > 1 :
@@ -275,14 +326,18 @@ def main():

     # find new news url
     id_cache = build_cache(db, url_table_name)
-    driver = get_trends(query_key, url_table, id_cache, driver)
-    time.sleep(5)
-
-    url_pd = get_next_job(db, url_table_name, query_key)
-    logger_.info('find {} news...'.format(len(url_pd)))
-    
-    driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
-    logger_.info('{} news description update'.format(query_key))
+    url = "https://www.google.com/search?q={}&tbm=nws".format(query_key)
+    # url = "https://www.google.com"
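+    # crawl the first three result pages; each get_trends call scrapes one page
+    # and returns the URL of the following page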
+    for i in range(3):
+        logger_.info('page {}'.format(i+1))
+        driver, url = get_trends(query_key, url_table, id_cache, driver, url, i + 1)
+        time.sleep(5)
+
+        url_pd = get_next_job(db, url_table_name, query_key)
+        logger_.info('find {} news...'.format(len(url_pd)))
+        
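+        # fetch article details for all pending URLs before moving to the next page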
+        driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
+        logger_.info('{} news description update'.format(query_key))

     db.close()
     driver.close()