import urllib.parse
import urllib.request
import requests
import traceback
from bs4 import BeautifulSoup as Soup
import socks
import ssl
import socket


class GoogleNews:
    """Scrapes Google News results, either from the Google search
    news tab (tbm=nws) or from the news.google.com front page."""

    def __init__(self):
        self.texts = []
        self.links = []
        self.results = []
        self.user_agent = ('Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) '
                           'Gecko/20100101 Firefox/64.0')
        self.headers = {'User-Agent': self.user_agent}

    def search(self, key):
        # Store the raw query; urlencode() in getpage() handles escaping.
        # (Pre-joining words with "+" would get double-encoded to %2B.)
        self.key = key
        self.getpage()

    def getpage(self, page=1):
        # Build the query string locally so repeated calls do not
        # re-encode self.key.
        query = urllib.parse.urlencode({'q': self.key})
        self.url = ("https://www.google.com/search?gl=tw&hl=zh-tw&" + query
                    + "&tbm=nws&start=%d" % (10 * (page - 1)))
        try:
            req = urllib.request.Request(self.url, headers=self.headers)

            # Disable certificate checks and route traffic through the
            # hard-coded SOCKS5 proxy. Note: assigning socks.socksocket to
            # socket.socket monkey-patches ALL sockets in this process.
            ctx = ssl.create_default_context()
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE
            socks.set_default_proxy(socks.SOCKS5, '172.104.67.159', 8180)
            socket.socket = socks.socksocket

            self.response = urllib.request.urlopen(req, context=ctx)
            self.page = self.response.read().decode('utf-8')
            self.content = Soup(self.page, "html.parser")

            # Each news result sits in a div with class "dbsr".
            result = self.content.find_all("div", class_="dbsr")
            for item in result:
                link = None
                img = None
                try:
                    link = item.find("a").get("href")
                    self.links.append(link)
                except Exception:
                    print('no link')
                    continue
                try:
                    img = item.find("img").get("src")
                except Exception:
                    img = None
                    print('no img')
                try:
                    # Best-effort title: use the anchor text of the result
                    # rather than the old hard-coded 'title' placeholder.
                    title = item.find("a").text
                    self.results.append(
                        {'title': title, 'link': link, 'img': img})
                except Exception:
                    print('exp')
            self.response.close()
        except Exception as e:
            traceback.print_exc()
            print(e)

    def get_news(self, deamplify=False):
        self.url = 'https://news.google.com/'
        try:
            self.response = requests.get(self.url)
            # requests responses expose the body as .text; they have no
            # read() method, so the old self.response.read() call is gone.
            self.page = self.response.text
            self.content = Soup(self.page, "html.parser")
            result = self.content.find_all("article")
            for item in result:
                try:
                    title = item.find("h3").text
                    if deamplify:
                        try:
                            # Recover the canonical URL from the jslog
                            # attribute instead of the AMP link.
                            link = (item.find("a").get("jslog")
                                    .split('2:')[1].split(';')[0])
                        except Exception as e:
                            print(e)
                            link = item.find("h3").find("a").get("href")
                    else:
                        link = item.find("h3").find("a").get("href")
                    self.texts.append(title)
                    self.links.append(link)
                    self.results.append(
                        {'title': title,
                         'datetime': item.find("time").get("datetime"),
                         'time': item.find("time").text,
                         'desc': item.find("h3").next_sibling.text,
                         'link': link,
                         'media': None,
                         'img': item.previous_sibling.find("img").get("src")})
                except Exception:
                    # Skip articles missing any of the fields above.
                    pass
            self.response.close()
        except Exception as e:
            print(e)

    def result(self):
        return self.results

    def gettext(self):
        return self.texts

    def getlinks(self):
        return self.links

    def clear(self):
        self.texts = []
        self.links = []
        self.results = []
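
# A minimal usage sketch, assuming the hard-coded SOCKS5 proxy above is
# reachable and that Google still serves the "dbsr" result markup; both are
# environment-dependent, so treat this as illustrative, not definitive.
if __name__ == "__main__":
    news = GoogleNews()

    # Search the Google News tab for a query; results accumulate in
    # news.results until clear() is called.
    news.search("python web scraping")
    for entry in news.result():
        print(entry['title'], entry['link'])

    # Independently, pull the news.google.com front page; deamplify=True
    # tries to recover canonical URLs from each article's jslog attribute.
    news.clear()
    news.get_news(deamplify=True)
    print(news.getlinks()[:5])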