@@ -1,9 +1,11 @@
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.support.wait import WebDriverWait
 import time, pickle, sys, os, re, time, requests
 import dataset
+import traceback
 import pandas as pd
 from datetime import datetime, timedelta
 from newspaper import Article
@@ -60,79 +62,81 @@ def conv_time(t):
     return s
 
 
-def get_trends(q, url_table, id_cache, driver):
-
-    driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
-    time.sleep(3)
-    # print(driver.page_source)
-    # click tool
-    # element = driver.find_element(By.ID, "hdtb-tls")
-    # driver.implicitly_wait(5)
-    # ActionChains(driver).move_to_element(element).click(element).perform()
-
-    # click time
-    # element = driver.find_elements(By.XPATH, "//div[@class='KTBKoe']")
-    # driver.implicitly_wait(5)
-    # ActionChains(driver).move_to_element(element[1]).click(element[1]).perform()
-
-    # click time
-    # element = driver.find_element(By.XPATH,"//div[@id='lb']")
-    # ele = element.find_elements(By.XPATH,"//g-menu-item[@class='ErsxPb']")
-    # for e in ele:
-    #     if e.text == '過去 24 小時':
-    #         print(e.text)
-    #         driver.implicitly_wait(5)
-    #         ActionChains(driver).move_to_element(e).click(e).perform()
-    #         break
-
-    c = 0
-    while True:
-        time.sleep(3)
-        c += 1
-        logger_.info('page {}'.format(c))
-        print(driver.page_source)
-        elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
-        print(elmts)
-        for elmt in elmts:
-            title, url, company = '', '', ''
-            e = elmt.find_element_by_xpath(".//div[@role='heading']")
-            title = e.text
-            # print(title)
-
-            url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
-            # print(url)
-
-            company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
-            # print(company)
-
-            day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
-            day = conv_time(day)
-            # print(day)
-
-            print(title, url, company, day)
-
-            if url not in id_cache:
-                url_table.insert({
-                    'title': title,
-                    'url': url,
-                    'keyword': q,
-                    'company': company,
-                    'day': str(day),
-                    'crawler_date': current,
-                    'page': c,
-                    '_status': 0
-                })
+def page_down_(driver, time_):
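+    # press PAGE_DOWN time_ times so content further down the page gets rendered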
 
-        if c > 3: break
+    for i in range(time_):
         try:
-            element = driver.find_element_by_xpath("//a[@id='pnnext']")
-            driver.implicitly_wait(5)
-            ActionChains(driver).move_to_element(element).click(element).perform()
+            actions = ActionChains(driver)
+            actions.send_keys(Keys.PAGE_DOWN).perform()
         except:
-            print('done')
-            break
-    logger_.info('{} news list update'.format(q))
-    return driver
+            traceback.print_exc()
+        time.sleep(0.5)
+
+
+# def get_trends(q, url_table, id_cache, driver):
+
+#     driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
+#     time.sleep(3)
+#     driver.refresh()
+#     c = 0
+#     while True:
+#         time.sleep(3)
+#         c += 1
+#         logger_.info('page {}'.format(c))
+#         print(driver.current_url)
+#         # print(driver.page_source)
+#         elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
+#         print(elmts)
+#         for elmt in elmts:
+#             title, url, company = '', '', ''
+#             e = elmt.find_element_by_xpath(".//div[@role='heading']")
+#             title = e.text
+#             # print(title)
+
+#             url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+#             # print(url)
+
+#             company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+#             # print(company)
+
+#             day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
+#             day = conv_time(day)
+#             # print(day)
+
+#             print(title, url, company, day)
+
+#             if url not in id_cache:
+#                 url_table.insert({
+#                     'title': title,
+#                     'url': url,
+#                     'keyword': q,
+#                     'company': company,
+#                     'day': str(day),
+#                     'crawler_date': current,
+#                     'page': c,
+#                     '_status': 0
+#                 })
+
+#         if c > 3: break
+
+#         next_url = driver.current_url
+#         next_url = next_url.replace('start={}'.format(c-1)*10,'start={}'.format(c)*10)
+#         driver.get(next_url)
+#         print(next_url)
+#         # try:
+#         #     page_down_(driver, 3)
+#         #     next_url = driver.find_element_by_xpath("//a[@id='pnnext']").get_attribute('href')
+#         #     driver.get(next_url)
+#         #     print(next_url)
+#         #     driver.implicitly_wait(5)
+#         #     ActionChains(driver).move_to_element(element).click(element).perform()
+#         # except:
+#         #     print('done')
+#         #     break
+#     logger_.info('{} news list update'.format(q))
+#     return driver
+
 
 def our_rule(url, company, driver):
     url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下', '經濟日報',
@@ -252,6 +256,58 @@ def get_next_job(db, table, query_key):
     return url_pd
 
 
+def get_trends(q, url_table, id_cache, driver, url, page):
+
+    driver.get(url)
+    time.sleep(3)
+
+    print(driver.current_url)
+    # print(driver.page_source)
+    elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
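+    # each matched div is a single result card on the news results page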
+    print(elmts)
+    for elmt in elmts:
+        title, url, company = '', '', ''
+        e = elmt.find_element_by_xpath(".//div[@role='heading']")
+        title = e.text
+        # print(title)
+
+        url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+        # print(url)
+
+        company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+        # print(company)
+
+        day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
+        day = conv_time(day)
+        # print(day)
+
+        print(title, url, company, day)
+
+        if url not in id_cache:
+            url_table.insert({
+                'title': title,
+                'url': url,
+                'keyword': q,
+                'company': company,
+                'day': str(day),
+                'crawler_date': current,
+                'page': page,
+                '_status': 0
+            })
+
+    # next_url = driver.current_url
+    # next_url = next_url.replace('start={}'.format(c-1)*10,'start={}'.format(c)*10)
+    # driver.get(next_url)
+    # print(next_url)
+    # the "next page" link is absent on the last results page, so guard the lookup
+    next_elmts = driver.find_elements_by_xpath("//a[@id='pnnext']")
+    next_url = next_elmts[0].get_attribute('href') if next_elmts else None
+
+    logger_.info('{} news list update'.format(q))
+    return driver, next_url
+
 def main():
 
     if len(sys.argv) > 1 :
@@ -275,14 +331,21 @@ def main():
 
     # find new news url
     id_cache = build_cache(db, url_table_name)
-    driver = get_trends(query_key, url_table, id_cache, driver)
-    time.sleep(5)
-
-    url_pd = get_next_job(db, url_table_name, query_key)
-    logger_.info('find {} news...'.format(len(url_pd)))
-
-    driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
-    logger_.info('{} news description update'.format(query_key))
+    url = "https://www.google.com/search?q={}&tbm=nws".format(query_key)
+    # url = "https://www.google.com"
+    for i in range(3):
+        logger_.info('page {}'.format(i+1))
+        driver, url = get_trends(query_key, url_table, id_cache, driver, url, i+1)
+        time.sleep(5)
+
+        url_pd = get_next_job(db, url_table_name, query_key)
+        logger_.info('find {} news...'.format(len(url_pd)))
+
+        driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
+        logger_.info('{} news description update'.format(query_key))
+        if url is None:  # stop early once get_trends finds no "next page" link
+            break
 
     db.close()
     driver.close()