gn3.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381
  1. ### MODULES
  2. import re
  3. import urllib.request
  4. import dateparser, copy
  5. from bs4 import BeautifulSoup as Soup, ResultSet
  6. from dateutil.parser import parse
  7. import socks
  8. import ssl
  9. import socket
  10. import datetime
  11. from dateutil.relativedelta import relativedelta
  12. ### METHODS
  13. def lexical_date_parser(date_to_check):
  14. if date_to_check=='':
  15. return ('',None)
  16. datetime_tmp=None
  17. date_tmp=copy.copy(date_to_check)
  18. count=0
  19. while datetime_tmp==None and count <= (len(date_to_check)-3):
  20. datetime_tmp=dateparser.parse(date_tmp)
  21. if datetime_tmp==None:
  22. date_tmp=date_tmp[1:]
  23. count+=1
  24. if datetime_tmp==None:
  25. date_tmp=date_to_check
  26. else:
  27. datetime_tmp=datetime_tmp.replace(tzinfo=None)
  28. if date_tmp[0]==' ':
  29. date_tmp=date_tmp[1:]
  30. return date_tmp,datetime_tmp
  31. def define_date(date):
  32. months = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
  33. try:
  34. if ' ago' in date.lower():
  35. q = int(date.split()[-3])
  36. if 'hour' in date.lower():
  37. return datetime.datetime.now() + relativedelta(hours=-q)
  38. elif 'day' in date.lower():
  39. return datetime.datetime.now() + relativedelta(days=-q)
  40. elif 'week' in date.lower():
  41. return datetime.datetime.now() + relativedelta(days=-7*q)
  42. elif 'month' in date.lower():
  43. return datetime.datetime.now() + relativedelta(months=-q)
  44. else:
  45. for month in months.keys():
  46. if month.lower()+' ' in date.lower():
  47. date_list = date.replace(',','').split()[-3:]
  48. return datetime.datetime(day=int(date_list[1]), month=months[month], year=int(date_list[2]))
  49. except:
  50. return float('nan')
### CLASSES
  52. class GoogleNews3:
  53. def __init__(self,lang="zh-tw",period="",start="",end="",encode="utf-8",region='tw'):
  54. self.__texts = []
  55. self.__links = []
  56. self.__results = []
  57. self.__totalcount = 0
  58. self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
  59. self.__lang = lang
  60. if region:
  61. self.accept_language= lang + '-' + region + ',' + lang + ';q=0.9'
  62. self.headers = {'User-Agent': self.user_agent, 'Accept-Language': self.accept_language}
  63. else:
  64. self.headers = {'User-Agent': self.user_agent}
  65. self.__period = period
  66. self.__start = start
  67. self.__end = end
  68. self.__encode = encode
  69. def set_lang(self, lang):
  70. self.__lang = lang
  71. def setlang(self, lang):
  72. """Don't remove this, will affect old version user when upgrade"""
  73. self.set_lang(lang)
  74. def set_period(self, period):
  75. self.__period = period
  76. def setperiod(self, period):
  77. """Don't remove this, will affect old version user when upgrade"""
  78. self.set_period(period)
  79. def set_time_range(self, start, end):
  80. self.__start = start
  81. self.__end = end
  82. def setTimeRange(self, start, end):
  83. """Don't remove this, will affect old version user when upgrade"""
  84. self.set_time_range(start, end)
  85. def set_encode(self, encode):
  86. self.__encode = encode
  87. def setencode(self, encode):
  88. """Don't remove this, will affect old version user when upgrade"""
  89. self.set_encode(encode)
  90. def search(self, key):
  91. """
  92. Searches for a term in google.com in the news section and retrieves the first page into __results.
  93. Parameters:
  94. key = the search term
  95. """
  96. self.__key = "+".join(key.split(" "))
  97. if self.__encode != "":
  98. self.__key = urllib.request.quote(self.__key.encode(self.__encode))
  99. self.get_page()
  100. def build_response(self):
  101. self.req = urllib.request.Request(self.url.replace("search?","search?hl=zh-tw&gl=tw&"), headers=self.headers)
  102. ctx = ssl.create_default_context()
  103. ctx.check_hostname = False
  104. ctx.verify_mode = ssl.CERT_NONE
  105. socks.set_default_proxy(socks.SOCKS5, '172.104.67.159', 8180)
  106. socket.socket = socks.socksocket
  107. self.response = urllib.request.urlopen(self.req)
  108. self.page = self.response.read().decode('utf-8')
  109. self.content = Soup(self.page, "html.parser")
  110. stats = self.content.find_all("div", id="result-stats")
  111. if stats and isinstance(stats, ResultSet):
  112. stats = re.search(r'[\d,]+', stats[0].text)
  113. self.__totalcount = int(stats.group().replace(',', ''))
  114. else:
  115. #TODO might want to add output for user to know no data was found
  116. return
  117. result = self.content.find_all("div", id="search")[0].find_all("g-card")
  118. return result
  119. def page_at(self, page=1):
  120. """
  121. Retrieves a specific page from google.com in the news sections into __results.
  122. Parameter:
  123. page = number of the page to be retrieved
  124. """
  125. results = []
  126. try:
  127. if self.__start != "" and self.__end != "":
  128. self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},cdr:1,cd_min:{},cd_max:{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__start,self.__end,(10 * (page - 1)))
  129. elif self.__period != "":
  130. self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},qdr:{},,sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__period,(10 * (page - 1)))
  131. else:
  132. self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,(10 * (page - 1)))
  133. except AttributeError:
  134. raise AttributeError("You need to run a search() before using get_page().")
  135. try:
  136. result = self.build_response()
  137. for item in result:
  138. try:
  139. tmp_text = item.find("div", {"role" : "heading"}).text.replace("\n","")
  140. except Exception:
  141. tmp_text = ''
  142. try:
  143. tmp_link = item.find("a").get("href")
  144. except Exception:
  145. tmp_link = ''
  146. try:
  147. tmp_media = item.findAll("g-img")[1].parent.text
  148. except Exception:
  149. tmp_media = ''
  150. try:
  151. tmp_date = item.find("div", {"role" : "heading"}).next_sibling.findNext('div').findNext('div').text
  152. tmp_date,tmp_datetime=lexical_date_parser(tmp_date)
  153. except Exception:
  154. tmp_date = ''
  155. tmp_datetime=None
  156. try:
  157. tmp_desc = item.find("div", {"role" : "heading"}).next_sibling.findNext('div').text.replace("\n","")
  158. except Exception:
  159. tmp_desc = ''
  160. try:
  161. tmp_img = item.findAll("g-img")[0].find("img").get("src")
  162. except Exception:
  163. tmp_img = ''
  164. self.__texts.append(tmp_text)
  165. self.__links.append(tmp_link)
  166. results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':define_date(tmp_date),'desc': tmp_desc, 'link': tmp_link,'img': tmp_img})
  167. self.response.close()
  168. except Exception as e_parser:
  169. print(e_parser)
  170. pass
  171. return results
  172. def get_page(self, page=1):
  173. """
  174. Retrieves a specific page from google.com in the news sections into __results.
  175. Parameter:
  176. page = number of the page to be retrieved
  177. """
  178. try:
  179. if self.__start != "" and self.__end != "":
  180. self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},cdr:1,cd_min:{},cd_max:{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__start,self.__end,(10 * (page - 1)))
  181. elif self.__period != "":
  182. self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},qdr:{},,sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__period,(10 * (page - 1)))
  183. else:
  184. self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,(10 * (page - 1)))
  185. except AttributeError:
  186. raise AttributeError("You need to run a search() before using get_page().")
  187. try:
  188. result = self.build_response()
  189. for item in result:
  190. try:
  191. tmp_text = item.find("div", {"role" : "heading"}).text.replace("\n","")
  192. except Exception:
  193. tmp_text = ''
  194. try:
  195. tmp_link = item.find("a").get("href")
  196. except Exception:
  197. tmp_link = ''
  198. try:
  199. tmp_media = item.findAll("g-img")[1].parent.text
  200. except Exception:
  201. tmp_media = ''
  202. try:
  203. tmp_date = item.find("div", {"role" : "heading"}).next_sibling.findNext('div').findNext('div').text
  204. tmp_date,tmp_datetime=lexical_date_parser(tmp_date)
  205. except Exception:
  206. tmp_date = ''
  207. tmp_datetime=None
  208. try:
  209. tmp_desc = item.find("div", {"role" : "heading"}).next_sibling.findNext('div').text.replace("\n","")
  210. except Exception:
  211. tmp_desc = ''
  212. try:
  213. tmp_img = item.findAll("g-img")[0].find("img").get("src")
  214. except Exception:
  215. tmp_img = ''
  216. self.__texts.append(tmp_text)
  217. self.__links.append(tmp_link)
  218. self.__results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':define_date(tmp_date),'desc': tmp_desc, 'link': tmp_link,'img': tmp_img})
  219. self.response.close()
  220. except Exception as e_parser:
  221. print(e_parser)
  222. pass
  223. def getpage(self, page=1):
  224. """Don't remove this, will affect old version user when upgrade"""
  225. self.get_page(page)
  226. def get_news(self, key="",deamplify=False):
  227. if key != '':
  228. key = "+".join(key.split(" "))
  229. if self.__encode != "":
  230. key = urllib.request.quote(key.encode(self.__encode))
  231. self.url = 'https://news.google.com/search?q={}+when:{}&hl={}'.format(key,self.__period,self.__lang.lower())
  232. else:
  233. self.url = 'https://news.google.com/?hl={}'.format(self.__lang)
  234. try:
  235. self.req = urllib.request.Request(self.url, headers=self.headers)
  236. #print(self.url)
  237. ctx = ssl.create_default_context()
  238. ctx.check_hostname = False
  239. ctx.verify_mode = ssl.CERT_NONE
  240. socks.set_default_proxy(socks.SOCKS5, '172.104.67.159', 8180)
  241. socket.socket = socks.socksocket
  242. self.response = urllib.request.urlopen(self.req)
  243. self.page = self.response.read().decode('utf-8')
  244. self.content = Soup(self.page, "html.parser")
  245. articles = self.content.select('div[class="NiLAwe y6IFtc R7GTQ keNKEd j7vNaf nID9nc"]')
  246. for article in articles:
  247. try:
  248. # title
  249. try:
  250. title=article.find('h3').text
  251. except:
  252. title=None
  253. # description
  254. try:
  255. desc=article.find('span').text
  256. except:
  257. desc=None
  258. # date
  259. try:
  260. date = article.find("time").text
  261. # date,datetime_tmp = lexial_date_parser(date)
  262. except:
  263. date = None
  264. # datetime
  265. try:
  266. datetime_chars=article.find('time').get('datetime')
  267. datetime_obj = parse(datetime_chars).replace(tzinfo=None)
  268. except:
  269. datetime_obj=None
  270. # link
  271. if deamplify:
  272. try:
  273. link = 'news.google.com/' + article.find("h3").find("a").get("href")
  274. except Exception as deamp_e:
  275. print(deamp_e)
  276. link = article.find("article").get("jslog").split('2:')[1].split(';')[0]
  277. else:
  278. link = 'news.google.com/' + article.find("h3").find("a").get("href")
  279. self.__texts.append(title)
  280. self.__links.append(link)
  281. if link.startswith('https://www.youtube.com/watch?v='):
  282. desc = 'video'
  283. # image
  284. try:
  285. img = article.find("img").get("src")
  286. except:
  287. img = None
  288. # site
  289. try:
  290. site=article.find("time").parent.find("a").text
  291. except:
  292. site=None
  293. # collection
  294. self.__results.append({'title':title,
  295. 'desc':desc,
  296. 'date':date,
  297. 'datetime':define_date(date),
  298. 'link':link,
  299. 'img':img,
  300. 'media':None,
  301. 'site':site})
  302. except Exception as e_article:
  303. print(e_article)
  304. self.response.close()
  305. except Exception as e_parser:
  306. print(e_parser)
  307. pass
  308. #print(self.__results[0])
  309. def total_count(self):
  310. return self.__totalcount
  311. def result(self,sort=False):
  312. """Don't remove this, will affect old version user when upgrade"""
  313. return self.results(sort)
  314. def results(self,sort=False):
  315. """Returns the __results.
  316. New feature: include datatime and sort the articles in decreasing order"""
  317. results=self.__results
  318. if sort:
  319. try:
  320. results.sort(key = lambda x:x['datetime'],reverse=True)
  321. except Exception as e_sort:
  322. print(e_sort)
  323. results=self.__results
  324. return results
  325. def get_texts(self):
  326. """Returns only the __texts of the __results."""
  327. return self.__texts
  328. def gettext(self):
  329. """Don't remove this, will affect old version user when upgrade"""
  330. return self.get_texts()
  331. def get_links(self):
  332. """Returns only the __links of the __results."""
  333. return self.__links
  334. def clear(self):
  335. self.__texts = []
  336. self.__links = []
  337. self.__results = []
  338. self.__totalcount = 0