@@ -1,9 +1,11 @@
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.support.wait import WebDriverWait
 import time, pickle, sys, os, re, time, requests
 import dataset
+import traceback
 import pandas as pd
 from datetime import datetime, timedelta
 from newspaper import Article
@@ -60,79 +62,81 @@ def conv_time(t):
     return s
 
 
-def get_trends(q, url_table, id_cache, driver):
-
-    driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
-    time.sleep(3)
-    # print(driver.page_source)
-    # click tool
-    # element = driver.find_element(By.ID, "hdtb-tls")
-    # driver.implicitly_wait(5)
-    # ActionChains(driver).move_to_element(element).click(element).perform()
-
-    # click time
-    # element = driver.find_elements(By.XPATH, "//div[@class='KTBKoe']")
-    # driver.implicitly_wait(5)
-    # ActionChains(driver).move_to_element(element[1]).click(element[1]).perform()
-
-    # click time
-    # element = driver.find_element(By.XPATH,"//div[@id='lb']")
-    # ele = element.find_elements(By.XPATH,"//g-menu-item[@class='ErsxPb']")
-    # for e in ele:
-    #     if e.text == '過去 24 小時':
-    #         print(e.text)
-    #         driver.implicitly_wait(5)
-    #         ActionChains(driver).move_to_element(e).click(e).perform()
-    #         break
-
-    c = 0
-    while True:
-        time.sleep(3)
-        c += 1
-        logger_.info('page {}'.format(c))
-        print(driver.page_source)
-        elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
-        print(elmts)
-        for elmt in elmts:
-            title, url, company = '', '', ''
-            e = elmt.find_element_by_xpath(".//div[@role='heading']")
-            title = e.text
-            # print(title)
-
-            url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
-            # print(url)
-
-            company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
-            # print(company)
-
-            day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
-            day = conv_time(day)
-            # print(day)
-
-            print(title, url, company, day)
-
-            if url not in id_cache:
-                url_table.insert({
-                    'title': title,
-                    'url': url,
-                    'keyword': q,
-                    'company': company,
-                    'day': str(day),
-                    'crawler_date': current,
-                    'page': c,
-                    '_status': 0
-                })
+def page_down_(driver, time_):
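+    # press PAGE_DOWN time_ times so content further down the page gets rendered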
 
-        if c > 3: break
+    for i in range(time_):
         try:
-            element = driver.find_element_by_xpath("//a[@id='pnnext']")
-            driver.implicitly_wait(5)
-            ActionChains(driver).move_to_element(element).click(element).perform()
+            actions = ActionChains(driver)
+            actions.send_keys(Keys.PAGE_DOWN).perform()
         except:
-            print('done')
-            break
-    logger_.info('{} news list update'.format(q))
-    return driver
+            traceback.print_exc()
+        time.sleep(0.5)
+
+
+# def get_trends(q, url_table, id_cache, driver):
+
+#     driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
+#     time.sleep(3)
+#     driver.refresh()
+#     c = 0
+#     while True:
+#         time.sleep(3)
+#         c += 1
+#         logger_.info('page {}'.format(c))
+#         print(driver.current_url)
+#         # print(driver.page_source)
+#         elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
+#         print(elmts)
+#         for elmt in elmts:
+#             title, url, company = '', '', ''
+#             e = elmt.find_element_by_xpath(".//div[@role='heading']")
+#             title = e.text
+#             # print(title)
+
+#             url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+#             # print(url)
+
+#             company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+#             # print(company)
+
+#             day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
+#             day = conv_time(day)
+#             # print(day)
+
+#             print(title, url, company, day)
+
+#             if url not in id_cache:
+#                 url_table.insert({
+#                     'title': title,
+#                     'url': url,
+#                     'keyword': q,
+#                     'company': company,
+#                     'day': str(day),
+#                     'crawler_date': current,
+#                     'page': c,
+#                     '_status': 0
+#                 })
+
+#         if c > 3: break
+
+#         next_url = driver.current_url
+#         next_url = next_url.replace('start={}'.format(c-1)*10,'start={}'.format(c)*10)
+#         driver.get(next_url)
+#         print(next_url)
+#         # try:
+#         #     page_down_(driver, 3)
+#         #     next_url = driver.find_element_by_xpath("//a[@id='pnnext']").get_attribute('href')
+#         #     driver.get(next_url)
+#         #     print(next_url)
+#         #     driver.implicitly_wait(5)
+#         #     ActionChains(driver).move_to_element(element).click(element).perform()
+#         # except:
+#         #     print('done')
+#         #     break
+#     logger_.info('{} news list update'.format(q))
+#     return driver
+
 
 def our_rule(url, company, driver):
     url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下', '經濟日報',
@@ -252,6 +256,58 @@ def get_next_job(db, table, query_key):
     return url_pd
 
 
+def get_trends(q, url_table, id_cache, driver, url, page):
+
+    driver.get(url)
+    time.sleep(3)
+
+    print(driver.current_url)
+    # print(driver.page_source)
+    elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
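+    # each matched div is a single result card on the news results page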
+    print(elmts)
+    for elmt in elmts:
+        title, url, company = '', '', ''
+        e = elmt.find_element_by_xpath(".//div[@role='heading']")
+        title = e.text
+        # print(title)
+
+        url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+        # print(url)
+
+        company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+        # print(company)
+
+        day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
+        day = conv_time(day)
+        # print(day)
+
+        print(title, url, company, day)
+
+        if url not in id_cache:
+            url_table.insert({
+                'title': title,
+                'url': url,
+                'keyword': q,
+                'company': company,
+                'day': str(day),
+                'crawler_date': current,
+                'page': page,
+                '_status': 0
+            })
+
+    # next_url = driver.current_url
+    # next_url = next_url.replace('start={}'.format(c-1)*10,'start={}'.format(c)*10)
+    # driver.get(next_url)
+    # print(next_url)
+    # the "next page" link is absent on the last results page, so guard the lookup
+    next_elmts = driver.find_elements_by_xpath("//a[@id='pnnext']")
+    next_url = next_elmts[0].get_attribute('href') if next_elmts else None
+
+    logger_.info('{} news list update'.format(q))
+    return driver, next_url
+
 def main():
 
     if len(sys.argv) > 1 :
@@ -275,14 +331,21 @@ def main():
 
     # find new news url
     id_cache = build_cache(db, url_table_name)
-    driver = get_trends(query_key, url_table, id_cache, driver)
-    time.sleep(5)
-
-    url_pd = get_next_job(db, url_table_name, query_key)
-    logger_.info('find {} news...'.format(len(url_pd)))
-
-    driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
-    logger_.info('{} news description update'.format(query_key))
+    url = "https://www.google.com/search?q={}&tbm=nws".format(query_key)
+    # url = "https://www.google.com"
+    for i in range(3):
+        logger_.info('page {}'.format(i+1))
+        driver, url = get_trends(query_key, url_table, id_cache, driver, url, i+1)
+        time.sleep(5)
+
+        url_pd = get_next_job(db, url_table_name, query_key)
+        logger_.info('find {} news...'.format(len(url_pd)))
+
+        driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
+        logger_.info('{} news description update'.format(query_key))
+        if url is None:  # stop early once get_trends finds no "next page" link
+            break
 
     db.close()
     driver.close()