#import urllib.request
import socket
import ssl
import traceback
import urllib
import urllib.parse    # explicit: `import urllib` alone does not load submodules
import urllib.request  # explicit: previously reachable only via requests' side effects

import requests
import socks
from bs4 import BeautifulSoup as Soup
#from fp.fp import FreeProxy
class GoogleNews():
    """Scrape Google News search results and the news.google.com front page.

    Collected data accumulates across calls in ``texts`` (headlines),
    ``links`` (article URLs) and ``results`` (per-article dicts); call
    :meth:`clear` to reset between queries.
    """

    def __init__(self):
        self.texts = []    # collected headline strings
        self.links = []    # collected article URLs
        self.results = []  # collected per-article metadata dicts
        self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
        self.headers = {'User-Agent': self.user_agent}

    def search(self, key):
        """Search Google News for *key* and collect the first page of results.

        Stores the raw query string; URL-encoding happens in :meth:`getpage`.
        (The old code pre-joined words with '+' and then urlencoded, which
        double-encoded the separators as ``%2B``.)
        """
        self.key = key
        self.getpage()

    def getpage(self, page=1):
        """Fetch result page *page* (1-based) for the current query.

        Appends each found article's link to ``self.links`` and a
        ``{'title', 'link', 'img'}`` dict to ``self.results``.  Network or
        parse failures are logged and swallowed (best-effort scraping).
        """
        # Encode into a local so repeated calls do not re-encode self.key.
        query = urllib.parse.urlencode({'q': self.key})
        self.url = ("https://www.google.com/search?gl=tw&hl=zh-tw&" + query
                    + "&tbm=nws&start=%d" % (10 * (page - 1)))
        try:
            self.req = urllib.request.Request(self.url, headers=self.headers)
            # NOTE(review): the previous version disabled TLS certificate
            # verification and monkey-patched socket.socket to route ALL
            # process traffic through a hard-coded third-party SOCKS5 proxy
            # (172.104.67.159:8180).  Both were unsafe and have been removed;
            # reinstate proxying, if needed, via urllib's ProxyHandler.
            self.response = urllib.request.urlopen(self.req)
            try:
                self.page = self.response.read().decode('utf-8')
            finally:
                # Close even if read/decode fails (old code leaked on error).
                self.response.close()
            self.content = Soup(self.page, "html.parser")
            # Each news hit lives in a <div class="dbsr"> wrapper.
            for item in self.content.find_all("div", class_="dbsr"):
                anchor = item.find("a")
                if anchor is None:
                    # Malformed hit without a link: skip, as before.
                    continue
                link = anchor.get("href")
                # Use the real headline text; the old code stored the
                # placeholder literal 'title' for every result.
                title = anchor.text
                img_tag = item.find("img")
                img = img_tag.get("src") if img_tag is not None else None
                self.links.append(link)
                self.results.append({'title': title, 'link': link, 'img': img})
        except Exception as e:
            traceback.print_exc()
            print(e)

    def get_news(self, deamplify=False):
        """Scrape the news.google.com front page into ``results``.

        deamplify: when True, try to extract the non-AMP target URL from the
        anchor's ``jslog`` attribute, falling back to the plain href.
        """
        self.url = 'https://news.google.com/'
        try:
            self.response = requests.get(self.url, headers=self.headers)
            # requests.Response has no read(); .text is the decoded body.
            # (The old code called self.response.read() here, which raised
            # AttributeError and silently aborted every call.)
            self.page = self.response.text
            self.content = Soup(self.page, "html.parser")
            for item in self.content.find_all("article"):
                try:
                    title = item.find("h3").text
                    if deamplify:
                        try:
                            # jslog embeds the canonical URL after '2:'.
                            link = item.find("a").get("jslog").split('2:')[1].split(';')[0]
                        except Exception as e:
                            print(e)
                            link = item.find("h3").find("a").get("href")
                    else:
                        link = item.find("h3").find("a").get("href")
                    self.texts.append(title)
                    self.links.append(link)
                    self.results.append(
                        {'title': title,
                         'datetime': item.find("time").get("datetime"),
                         'time': item.find("time").text,
                         'desc': item.find("h3").next_sibling.text,
                         'link': link,
                         'media': None,
                         'img': item.previous_sibling.find("img").get("src")})
                except Exception:
                    # Article lacking an expected sub-element: skip it.
                    continue
            self.response.close()
        except Exception as e:
            print(e)

    def result(self):
        """Return the accumulated list of result dicts."""
        return self.results

    def gettext(self):
        """Return the accumulated list of headline strings."""
        return self.texts

    def getlinks(self):
        """Return the accumulated list of article URLs."""
        return self.links

    def clear(self):
        """Reset all accumulated state for a fresh query."""
        self.texts = []
        self.links = []
        self.results = []
|