# encoding: utf-8

import re

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2 fallback

try:
    from bs4 import BeautifulSoup  # BeautifulSoup 4
except ImportError:
    from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 fallback

try:
    import json
    import_json = True
except ImportError:
    import_json = False


class OpenGraph(dict):
    """A dict of Open Graph properties parsed from a page's og: meta tags."""

    required_attrs = ['title', 'type', 'image', 'url', 'description']

    def __init__(self, url=None, html=None, scrape=False, **kwargs):
        # If scrape is True, missing attributes will be scraped from the
        # page's body as a fallback.
        self.scrape = scrape
        self._url = url

        dict.__init__(self)
        self.update(kwargs)

        if url is not None:
            self.fetch(url)

        if html is not None:
            self.parser(html)

    def __setattr__(self, name, val):
        # Attribute writes are stored as dict items
        self[name] = val

    def __getattr__(self, name):
        # Attribute reads are proxied to dict items; raise AttributeError
        # (not KeyError) so getattr()/hasattr() behave as expected
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def fetch(self, url):
        """Download the page at ``url`` and parse its Open Graph data."""
        raw = urlopen(url)
        html = raw.read()
        return self.parser(html)

    def parser(self, html):
        """Extract og: properties from ``html`` into this dict."""
        if not isinstance(html, BeautifulSoup):
            doc = BeautifulSoup(html, "lxml")
        else:
            doc = html
        ogs = doc.html.head.findAll(property=re.compile(r'^og:'))
        for og in ogs:
            if og.has_attr('content'):
                # Strip the "og:" prefix: "og:title" -> "title"
                self[og['property'][3:]] = og['content']

        # Couldn't fetch all attrs from og tags; try scraping the body
        if not self.is_valid() and self.scrape:
            for attr in self.required_attrs:
                if not self.valid_attr(attr):
                    try:
                        self[attr] = getattr(self, 'scrape_%s' % attr)(doc)
                    except AttributeError:
                        pass

    def valid_attr(self, attr):
        return self.get(attr) and len(self[attr]) > 0

    def is_valid(self):
        return all(self.valid_attr(attr) for attr in self.required_attrs)

    def to_html(self):
        if not self.is_valid():
            return '<meta property="og:error" content="og metadata is not valid" />'

        meta = ""
        for key, value in self.items():
            meta += '<meta property="og:%s" content="%s" />\n' % (key, value)
        return meta

    def to_json(self):
        # TODO: force unicode
        if not import_json:
            return '{"error": "json module is not available"}'

        if not self.is_valid():
            return json.dumps({'error': 'og metadata is not valid'})

        return json.dumps(self)

    def to_xml(self):
        # Not implemented
        pass

    def scrape_image(self, doc):
        # Fall back to the first image on the page that has a src attribute
        images = [img['src'] for img in doc.html.body.findAll('img')
                  if img.has_attr('src')]
        if images:
            return images[0]
        return ''

    def scrape_title(self, doc):
        return doc.html.head.title.text

    def scrape_type(self, doc):
        return 'other'

    def scrape_url(self, doc):
        return self._url

    def scrape_description(self, doc):
        tags = doc.html.head.findAll('meta', attrs={"name": "description"})
        return "".join(t['content'] for t in tags if t.has_attr('content'))
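
if __name__ == "__main__":
    # Minimal usage sketch: "http://example.com/" is a placeholder URL,
    # fetching it requires network access, and the output depends on the
    # target page actually exposing og: meta tags.
    og = OpenGraph(url="http://example.com/", scrape=True)
    if og.is_valid():
        print(og.title)      # attribute access is proxied to dict keys
        print(og.to_json())
    else:
        print(og.to_html())  # emits an og:error meta tag when invalid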