import urllib.request
import urllib.parse
import requests
import traceback
from bs4 import BeautifulSoup as Soup
# from fp.fp import FreeProxy
import socks
import ssl
import socket


class GoogleNews():
    def __init__(self):
        self.texts = []
        self.links = []
        self.results = []
        self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
        self.headers = {'User-Agent': self.user_agent}
#        self.proxy = FreeProxy().get()

    def search(self, key):
        # Turn the query into a "+"-joined keyword string and fetch the first result page.
        self.key = "+".join(key.split(" "))
        self.getpage()

    def getpage(self, page=1):
        # Build the Google News search URL (Taiwan locale) for the requested page.
        self.key = urllib.parse.urlencode({'q': self.key})
        self.url = "https://www.google.com/search?gl=tw&hl=zh-tw&" + self.key + "&tbm=nws&start=%d" % (10 * (page - 1))
        try:
#            proxy_support = urllib.request.ProxyHandler(self.proxy)
#            opener = urllib.request.build_opener(proxy_support)
#            urllib.request.install_opener(opener)
            self.req = urllib.request.Request(self.url, headers=self.headers)
            # Disable certificate verification and route all sockets through a SOCKS5 proxy.
            ctx = ssl.create_default_context()
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE
            socks.set_default_proxy(socks.SOCKS5, '172.104.67.159', 8180)
            socket.socket = socks.socksocket
            self.response = urllib.request.urlopen(self.req, context=ctx)
            self.page = self.response.read().decode('utf-8')
            self.content = Soup(self.page, "html.parser")
            # Each news result is rendered as a <div class="dbsr"> card
            # (older markup used <div class="g">).
#            result = self.content.find_all("div", class_="g")
            result = self.content.find_all("div", class_="dbsr")
            for item in result:
                link = None
                img = None
                try:
                    link = item.find("a").get("href")
                    self.links.append(link)
                except Exception:
                    print('no link')
                    continue
                try:
                    img = item.find("img").get("src")
                except Exception:
                    img = None
                    print('no img')
                try:
                    # The "dbsr" cards no longer expose an <h3>, so the anchor text
                    # is used as the title.
                    self.results.append(
                        {'title': item.find("a").text, 'link': link, 'img': img})
#                        {'title': item.find("h3").text,
#                         'media': item.find("div", class_="slp").find_all("span")[0].text,
#                         'date': item.find("div", class_="slp").find_all("span")[2].text,
#                         'desc': item.find("div", class_="st").text,
#                         'link': link, 'img': img})
                except Exception:
                    print('failed to parse result item')
            self.response.close()
        except Exception as e:
            traceback.print_exc()
            print(e)

    def get_news(self, deamplify=False):
        # Scrape the Google News front page; each story is an <article> element.
        self.url = 'https://news.google.com/'
        try:
#            self.req = urllib.request.Request(self.url, headers=self.headers)
#            self.response = urllib.request.urlopen(self.req)
            self.response = requests.get(self.url)
            self.page = self.response.text
            self.content = Soup(self.page, "html.parser")
            result = self.content.find_all("article")
            for item in result:
                try:
                    title = item.find("h3").text
                    if deamplify:
                        # Pull the canonical URL out of the jslog attribute instead of
                        # the relative/AMP link.
                        try:
                            link = item.find("a").get("jslog").split('2:')[1].split(';')[0]
                        except Exception as e:
                            print(e)
                            link = item.find("h3").find("a").get("href")
                    else:
                        link = item.find("h3").find("a").get("href")
                    self.texts.append(title)
                    self.links.append(link)
                    self.results.append(
                        {'title': title,
                         'datetime': item.find("time").get("datetime"),
                         'time': item.find("time").text,
                         'desc': item.find("h3").next_sibling.text,
                         'link': link,
                         'media': None,
                         'img': item.previous_sibling.find("img").get("src")})
                except Exception:
                    pass
            self.response.close()
        except Exception as e:
            print(e)

    def result(self):
        return self.results

    def gettext(self):
        return self.texts

    def getlinks(self):
        return self.links

    def clear(self):
        self.texts = []
        self.links = []
        self.results = []
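

# A minimal usage sketch, assuming the module is run directly, that the SOCKS5
# proxy hard-coded in getpage() is reachable, and that Google's result markup
# still matches the selectors above; the query string is only an example.
if __name__ == "__main__":
    googlenews = GoogleNews()
    googlenews.search("python web scraping")      # fetch the first page of news results
    for entry in googlenews.result():             # each entry is a dict with title/link/img
        print(entry["title"], entry["link"])
    googlenews.clear()                            # reset stored results between searches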