@@ -0,0 +1,161 @@
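+"""Scraper for Google News results.
+
+GoogleNews.search() queries the news tab of Google web search (through a
+hard-coded SOCKS5 proxy); GoogleNews.get_news() scrapes the
+news.google.com front page.
+"""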
+import urllib.parse
+import urllib.request
+import requests
+import traceback
+import socks
+import ssl
+import socket
+from bs4 import BeautifulSoup as Soup
+
+
+class GoogleNews:
+
+    def __init__(self):
+        self.texts = []
+        self.links = []
+        self.results = []
+        self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
+        self.headers = {'User-Agent': self.user_agent}
+
+    def search(self, key):
+        # Keep the raw query string; getpage() URL-encodes it. Joining on
+        # "+" here and encoding again would double-encode the query.
+        self.key = key
+        self.getpage()
+
+    def getpage(self, page=1):
+        # Build the news-tab search URL; urlencode escapes the query safely.
+        query = urllib.parse.urlencode({'q': self.key})
+        self.url = "https://www.google.com/search?" + query + "&tbm=nws&start=%d" % (10 * (page - 1))
+        try:
+            self.req = urllib.request.Request(self.url, headers=self.headers)
+
+            # Skip certificate verification so the request survives any TLS
+            # interception on the proxy path.
+            ctx = ssl.create_default_context()
+            ctx.check_hostname = False
+            ctx.verify_mode = ssl.CERT_NONE
+
+            # Route every socket through the SOCKS5 proxy.
+            socks.set_default_proxy(socks.SOCKS5, '172.104.67.159', 8180)
+            socket.socket = socks.socksocket
+
+            self.response = urllib.request.urlopen(self.req, context=ctx)
+            self.page = self.response.read().decode('utf-8')
+            self.content = Soup(self.page, "html.parser")
+
+            # Each news result is rendered as a div with class "dbsr".
+            result = self.content.find_all("div", class_="dbsr")
+            for item in result:
+                link = None
+                img = None
+                try:
+                    link = item.find("a").get("href")
+                    self.links.append(link)
+                except Exception:
+                    print('no link')
+                    continue
+                try:
+                    img = item.find("img").get("src")
+                except Exception:
+                    img = None
+                    print('no img')
+                try:
+                    title = item.find("h3").text
+                except Exception:
+                    title = None
+                self.results.append({'title': title, 'link': link, 'img': img})
+            self.response.close()
+        except Exception as e:
+            traceback.print_exc()
+            print(e)
+
+    def get_news(self, deamplify=False):
+        self.url = 'https://news.google.com/'
+        try:
+            self.response = requests.get(self.url, headers=self.headers)
+            self.page = self.response.text
+            self.content = Soup(self.page, "html.parser")
+            result = self.content.find_all("article")
+            for item in result:
+                try:
+                    title = item.find("h3").text
+                    if deamplify:
+                        # Try to recover the publisher URL embedded in the
+                        # article's jslog attribute instead of the AMP link.
+                        try:
+                            link = item.find("a").get("jslog").split('2:')[1].split(';')[0]
+                        except Exception as e:
+                            print(e)
+                            link = item.find("h3").find("a").get("href")
+                    else:
+                        link = item.find("h3").find("a").get("href")
+                    self.texts.append(title)
+                    self.links.append(link)
+                    self.results.append(
+                        {'title': title,
+                         'datetime': item.find("time").get("datetime"),
+                         'time': item.find("time").text,
+                         'desc': item.find("h3").next_sibling.text,
+                         'link': link,
+                         'media': None,
+                         'img': item.previous_sibling.find("img").get("src")})
+                except Exception:
+                    # Skip articles missing any of the fields above.
+                    continue
+            self.response.close()
+        except Exception as e:
+            print(e)
+
+    def result(self):
+        return self.results
+
+    def gettext(self):
+        return self.texts
+
+    def getlinks(self):
+        return self.links
+
+    def clear(self):
+        self.texts = []
+        self.links = []
+        self.results = []
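+
+
+# Minimal usage sketch (an illustration, not part of the original module):
+# it assumes network access and, for search(), that the SOCKS5 proxy
+# configured in getpage() is reachable.
+if __name__ == "__main__":
+    gn = GoogleNews()
+    gn.get_news()
+    for entry in gn.result():
+        print(entry['title'], '->', entry['link'])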