noodles, 3 years ago
Commit 93864cb2fb
1 file changed, 133 insertions and 78 deletions

gnews_keyword.py  +133 -78

@@ -1,9 +1,11 @@
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.support.wait import WebDriverWait
 import time, pickle, sys, os, re, time, requests
 import dataset
+import traceback
 import pandas as pd
 from datetime import datetime, timedelta
 from newspaper import Article
@@ -60,79 +62,80 @@ def conv_time(t):
     return s


-def get_trends(q, url_table, id_cache, driver):
-    
-    driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
-    time.sleep(3)
-    # print(driver.page_source)
-    # click tool
-    # element = driver.find_element(By.ID, "hdtb-tls")
-    # driver.implicitly_wait(5)
-    # ActionChains(driver).move_to_element(element).click(element).perform()
-
-    # click time
-    # element = driver.find_elements(By.XPATH, "//div[@class='KTBKoe']")
-    # driver.implicitly_wait(5)
-    # ActionChains(driver).move_to_element(element[1]).click(element[1]).perform()
-
-    # click time
-    # element = driver.find_element(By.XPATH,"//div[@id='lb']")
-    # ele = element.find_elements(By.XPATH,"//g-menu-item[@class='ErsxPb']")
-    # for e in ele:
-    #     if e.text == '過去 24 小時':
-    #         print(e.text)
-    #         driver.implicitly_wait(5)
-    #         ActionChains(driver).move_to_element(e).click(e).perform()
-    #         break
-
-    c = 0
-    while True:
-        time.sleep(3)
-        c += 1
-        logger_.info('page {}'.format(c))
-        print(driver.page_source)
-        elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
-        print(elmts)
-        for elmt in elmts:
-            title, url, company = '', '', ''
-            e = elmt.find_element_by_xpath(".//div[@role='heading']")
-            title = e.text
-            # print(title)
-
-            url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
-            # print(url)
-
-            company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
-            # print(company)
-
-            day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
-            day = conv_time(day)
-            # print(day)
-
-            print(title, url, company, day)
-            
-            if url not in id_cache:
-                url_table.insert({
-                    'title': title,
-                    'url': url,
-                    'keyword': q,
-                    'company': company,
-                    'day': str(day),
-                    'crawler_date': current,
-                    'page': c,
-                    '_status': 0
-                })
+def page_down_(driver, time_):

-            if c > 3: break
+    for i in range(time_):
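+        # press PAGE_DOWN once per iteration to scroll the page, pausing 0.5 s between presses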
         try:
-            element = driver.find_element_by_xpath("//a[@id='pnnext']")
-            driver.implicitly_wait(5)
-            ActionChains(driver).move_to_element(element).click(element).perform()
+            actions = ActionChains(driver)
+            actions.send_keys(Keys.PAGE_DOWN).perform()
         except:
-            print('done')
-            break
-    logger_.info('{} news list update'.format(q))
-    return driver 
+            traceback.print_exc()
+        time.sleep(0.5)
+
+
+# def get_trends(q, url_table, id_cache, driver):
+    
+#     driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
+#     time.sleep(3)
+#     driver.refresh()
+#     c = 0
+#     while True:
+#         time.sleep(3)
+#         c += 1
+#         logger_.info('page {}'.format(c))
+#         print(driver.current_url)
+#         # print(driver.page_source)
+#         elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
+#         print(elmts)
+#         for elmt in elmts:
+#             title, url, company = '', '', ''
+#             e = elmt.find_element_by_xpath(".//div[@role='heading']")
+#             title = e.text
+#             # print(title)
+
+#             url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+#             # print(url)
+
+#             company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+#             # print(company)
+
+#             day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
+#             day = conv_time(day)
+#             # print(day)
+
+#             print(title, url, company, day)
+            
+#             if url not in id_cache:
+#                 url_table.insert({
+#                     'title': title,
+#                     'url': url,
+#                     'keyword': q,
+#                     'company': company,
+#                     'day': str(day),
+#                     'crawler_date': current,
+#                     'page': c,
+#                     '_status': 0
+#                 })
+
+#         if c > 3: break
+
+#         next_url = driver.current_url
+#         next_url = next_url.replace('start={}'.format(c-1)*10,'start={}'.format(c)*10)
+#         driver.get(next_url)
+#         print(next_url)
+#         # try:
+#         # page_down_(driver, 3)
+#         # next_url = driver.find_element_by_xpath("//a[@id='pnnext']").get_attribute('href')
+#         # driver.get(next_url)
+#         # print(next_url)
+#         # driver.implicitly_wait(5)
+#             # ActionChains(driver).move_to_element(element).click(element).perform()
+#         # except:
+#         #     print('done')
+#         #     break
+#     logger_.info('{} news list update'.format(q))
+#     return driver 
+

 def our_rule(url, company, driver):
     url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下', '經濟日報',
@@ -252,6 +255,54 @@ def get_next_job(db, table, query_key):
     return url_pd


+def get_trends(q, url_table, id_cache, driver, url, page):
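+    # scrape a single Google News results page for keyword q, queue unseen
+    # article URLs, and return the driver together with the next-page URL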
+    
+    driver.get(url)
+    time.sleep(3)
+
+    print(driver.current_url)
+    # print(driver.page_source)
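+    # each matching div is one news result card on the page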
+    elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
+    print(elmts)
+    for elmt in elmts:
+        title, url, company = '', '', ''
+        e = elmt.find_element_by_xpath(".//div[@role='heading']")
+        title = e.text
+        # print(title)
+
+        url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+        # print(url)
+
+        company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+        # print(company)
+
+        day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
+        day = conv_time(day)
+        # print(day)
+
+        print(title, url, company, day)
+        
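+        # only queue URLs that have not been crawled before (id_cache is built from the url table)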
+        if url not in id_cache:
+            url_table.insert({
+                'title': title,
+                'url': url,
+                'keyword': q,
+                'company': company,
+                'day': str(day),
+                'crawler_date': current,
+                'page': page,
+                '_status': 0
+            })
+
+    # next_url = driver.current_url
+    # next_url = next_url.replace('start={}'.format(c-1)*10,'start={}'.format(c)*10)
+    # driver.get(next_url)
+    # print(next_url)
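+    # follow Google's "Next" pagination link (id='pnnext'); this assumes the
+    # link exists and will raise NoSuchElementException on the last page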
+    next_url = driver.find_element_by_xpath("//a[@id='pnnext']").get_attribute('href')
+
+    logger_.info('{} news list update'.format(q))
+    return driver, next_url
+
 def main():

     if len(sys.argv) > 1 :
@@ -275,14 +326,18 @@ def main():

     # find new news url
     id_cache = build_cache(db, url_table_name)
-    driver = get_trends(query_key, url_table, id_cache, driver)
-    time.sleep(5)
-
-    url_pd = get_next_job(db, url_table_name, query_key)
-    logger_.info('find {} news...'.format(len(url_pd)))
-    
-    driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
-    logger_.info('{} news description update'.format(query_key))
+    url = "https://www.google.com/search?q={}&tbm=nws".format(query_key)
+    # url = "https://www.google.com"
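+    # crawl the first three result pages; each get_trends call scrapes one page
+    # and returns the URL of the following page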
+    for i in range(3):
+        logger_.info('page {}'.format(i+1))
+        driver, url = get_trends(query_key, url_table, id_cache, driver, url, i + 1)
+        time.sleep(5)
+
+        url_pd = get_next_job(db, url_table_name, query_key)
+        logger_.info('find {} news...'.format(len(url_pd)))
+        
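+        # fetch article details for all pending URLs before moving to the next page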
+        driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
+        logger_.info('{} news description update'.format(query_key))

     db.close()
     driver.close()