opengraph.py

# encoding: utf-8

import re
import urllib.request

# bs4 is required: parser() relies on the "lxml" parser argument and on
# Tag.has_attr(), neither of which exists in the legacy BeautifulSoup 3.
from bs4 import BeautifulSoup

try:
    import json
    import_json = True
except ImportError:  # json has been in the stdlib since Python 2.6
    import_json = False

class OpenGraph(dict):
    """A dict of the Open Graph properties of a web page.

    Properties are read from og: meta tags; when scrape=True, missing
    required attributes are scraped from the page body as a fallback.
    """

    required_attrs = ['title', 'type', 'image', 'url', 'description']

    def __init__(self, url=None, html=None, scrape=False, **kwargs):
        # If scrape == True, we will try to fetch missing attributes
        # from the page's body.
        # Bookkeeping values are set with object.__setattr__ so they
        # bypass __setattr__ and stay out of the og: property dict.
        object.__setattr__(self, 'scrape', scrape)
        object.__setattr__(self, '_url', url)
        for k in list(kwargs.keys()):
            self[k] = kwargs[k]
        dict.__init__(self)
        if url is not None:
            self.fetch(url)
        if html is not None:
            self.parser(html)

    def __setattr__(self, name, val):
        # Public attributes are stored as og: properties in the dict.
        self[name] = val

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            # Raise AttributeError (not KeyError) so hasattr/getattr work.
            raise AttributeError(name)

    def fetch(self, url):
        """Fetch the page at `url` and parse its Open Graph properties."""
        raw = urllib.request.urlopen(url)
        html = raw.read()
        return self.parser(html)

    def parser(self, html):
        """Parse og: meta tags out of `html` into this dict."""
        if not isinstance(html, BeautifulSoup):
            doc = BeautifulSoup(html, "lxml")
        else:
            doc = html
        ogs = doc.html.head.findAll(property=re.compile(r'^og'))
        for og in ogs:
            if og.has_attr('content'):
                # Strip the "og:" prefix from the property name.
                self[og['property'][3:]] = og['content']
        # Couldn't fetch all attrs from og tags, try scraping the body.
        if not self.is_valid() and self.scrape:
            for attr in self.required_attrs:
                if not self.valid_attr(attr):
                    try:
                        self[attr] = getattr(self, 'scrape_%s' % attr)(doc)
                    except AttributeError:
                        pass

    def valid_attr(self, attr):
        return self.get(attr) and len(self[attr]) > 0

    def is_valid(self):
        return all([self.valid_attr(attr) for attr in self.required_attrs])

    def to_html(self):
        if not self.is_valid():
            return '<meta property="og:error" content="og metadata is not valid" />'
        meta = ""
        for key, value in self.items():
            meta += '\n<meta property="og:%s" content="%s" />' % (key, value)
        meta += "\n"
        return meta

    def to_json(self):
        # TODO: force unicode
        if not import_json:
            return '{"error": "there is no json module"}'
        if not self.is_valid():
            return json.dumps({'error': 'og metadata is not valid'})
        return json.dumps(self)

    def to_xml(self):
        pass

    def scrape_image(self, doc):
        # Fall back to the first <img> in the body that has a src attribute.
        images = [img['src'] for img in doc.html.body.findAll('img')
                  if img.has_attr('src')]
        if images:
            return images[0]
        return ''

    def scrape_title(self, doc):
        return doc.html.head.title.text

    def scrape_type(self, doc):
        return 'other'

    def scrape_url(self, doc):
        return self._url

    def scrape_description(self, doc):
        tags = doc.html.head.findAll('meta', attrs={"name": "description"})
        return "".join([t.get('content', '') for t in tags])
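

# A minimal usage sketch (not part of the original module). The URL is
# purely illustrative; any page that exposes og: meta tags would work.
if __name__ == '__main__':
    og = OpenGraph(url='http://example.com/', scrape=True)
    if og.is_valid():
        print(og.to_json())  # e.g. {"title": "...", "type": "...", ...}
    else:
        print(og.to_html())  # emits an og:error meta tag when invalid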