GoogleNews2.py

import urllib.parse
import urllib.request
import requests
import traceback
from bs4 import BeautifulSoup as Soup
# from fp.fp import FreeProxy
import socks
import ssl
import socket
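
# Dependency note: besides the standard library, this module needs
# beautifulsoup4, requests, and PySocks (an assumption about the intended
# environment): pip install beautifulsoup4 requests PySocks
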
class GoogleNews:
    def __init__(self):
        self.texts = []
        self.links = []
        self.results = []
        self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
        self.headers = {'User-Agent': self.user_agent}
        # self.proxy = FreeProxy().get()

    def search(self, key):
        self.key = key  # urlencode() in getpage() handles spaces; "+"-joining first would double-encode
        self.getpage()

    def getpage(self, page=1):
        # Encode into a local so repeated calls do not re-encode self.key.
        query = urllib.parse.urlencode({'q': self.key})
        print(query)
        self.url = ("https://www.google.com/search?gl=tw&hl=zh-tw&" + query
                    + "&tbm=nws&start=%d" % (10 * (page - 1)))
        try:
            print(self.url)
            # proxy_support = urllib.request.ProxyHandler(self.proxy)
            # opener = urllib.request.build_opener(proxy_support)
            # urllib.request.install_opener(opener)
            self.req = urllib.request.Request(self.url, headers=self.headers)
            # Disable certificate verification; the SOCKS proxy may break it.
            ctx = ssl.create_default_context()
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE
            # Route all sockets in this process through the SOCKS5 proxy.
            socks.set_default_proxy(socks.SOCKS5, '172.104.67.159', 8180)
            socket.socket = socks.socksocket
            self.response = urllib.request.urlopen(self.req, context=ctx)
            self.page = self.response.read().decode('utf-8')
            self.content = Soup(self.page, "html.parser")
            # Each news result is wrapped in a <div class="dbsr"> block.
            result = self.content.find_all("div", class_="dbsr")
            for item in result:
                link = None
                img = None
                try:
                    link = item.find("a").get("href")
                    print(item.find("a").text)
                    self.links.append(link)
                except Exception:
                    print('no link')
                    continue
                try:
                    img = item.find("img").get("src")
                except Exception:
                    img = None
                    print('no img')
                try:
                    # Placeholder title; the fuller record below is kept for
                    # when the h3/slp/st selectors are verified against the
                    # current markup:
                    # {'title': item.find("h3").text,
                    #  'media': item.find("div", class_="slp").find_all("span")[0].text,
                    #  'date': item.find("div", class_="slp").find_all("span")[2].text,
                    #  'desc': item.find("div", class_="st").text,
                    #  'link': link, 'img': img}
                    self.results.append({'title': 'title', 'link': link})
                except Exception:
                    print('failed to append result')
            self.response.close()
        except Exception as e:
            traceback.print_exc()
            print(e)
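
    # Note: socks.set_default_proxy() plus the socket.socket patch above
    # reroute every socket in this process, including the requests.get()
    # call in get_news() below. A more contained alternative (a sketch,
    # assuming requests is installed with the requests[socks] extra) would
    # scope the proxy to a single request:
    #   proxies = {'http': 'socks5://172.104.67.159:8180',
    #              'https': 'socks5://172.104.67.159:8180'}
    #   requests.get(self.url, headers=self.headers, proxies=proxies)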

    def get_news(self, deamplify=False):
        self.url = 'https://news.google.com/'
        try:
            # self.req = urllib.request.Request(self.url, headers=self.headers)
            # self.response = urllib.request.urlopen(self.req)
            self.response = requests.get(self.url, headers=self.headers)
            # requests responses expose .text, not .read()
            self.page = self.response.text
            self.content = Soup(self.page, "html.parser")
            result = self.content.find_all("article")
            for item in result:
                try:
                    title = item.find("h3").text
                    if deamplify:
                        # Pull the publisher URL embedded in the jslog
                        # attribute instead of the AMP redirect link.
                        try:
                            link = item.find("a").get("jslog").split('2:')[1].split(';')[0]
                        except Exception as e:
                            print(e)
                            link = item.find("h3").find("a").get("href")
                    else:
                        link = item.find("h3").find("a").get("href")
                    self.texts.append(title)
                    self.links.append(link)
                    self.results.append(
                        {'title': title,
                         'datetime': item.find("time").get("datetime"),
                         'time': item.find("time").text,
                         'desc': item.find("h3").next_sibling.text,
                         'link': link,
                         'media': None,
                         'img': item.previous_sibling.find("img").get("src")})
                except Exception:
                    # Skip articles that do not match the expected markup.
                    pass
            self.response.close()
        except Exception as e:
            print(e)

    def result(self):
        return self.results

    def gettext(self):
        return self.texts

    def getlinks(self):
        return self.links

    def clear(self):
        self.texts = []
        self.links = []
        self.results = []
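

# Minimal usage sketch. It assumes the hard-coded SOCKS5 proxy in getpage()
# is reachable and that Google's markup still matches the selectors above;
# both change over time, so treat the output as illustrative.
if __name__ == "__main__":
    gn = GoogleNews()
    gn.search("python web scraping")  # page 1 of Google News search results
    print(gn.result())                # list of {'title': ..., 'link': ...} dicts
    gn.clear()                        # reset state before an unrelated query
    gn.get_news()                     # scrape the news.google.com front page
    print(gn.gettext()[:5])           # first few headlines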