# encoding: utf-8
# Targets Python 3: fetching uses urllib.request, parsing uses BeautifulSoup.

import re
import urllib.request

try:
    from bs4 import BeautifulSoup
except ImportError:
    # Legacy fallback; note that parser() passes the "lxml" builder,
    # which requires BeautifulSoup 4.
    from BeautifulSoup import BeautifulSoup

# json is treated as optional only so that to_json() can report its absence.
try:
    import json
    import_json = True
except ImportError:
    import_json = False


class OpenGraph(dict):
    """
    Open Graph protocol metadata for a single page, stored as a mapping of
    og:* property names (without the "og:" prefix) to their content values.
    """

    required_attrs = ['title', 'type', 'image', 'url', 'description']

    def __init__(self, url=None, html=None, scrape=False, **kwargs):
        # If scrape == True, missing attributes are scraped from the
        # page's body.
        dict.__init__(self)

        # Keep bookkeeping values as plain instance attributes so they do
        # not leak into the og:* mapping (and into to_html()/to_json()).
        object.__setattr__(self, 'scrape', scrape)
        object.__setattr__(self, '_url', url)

        for k, v in kwargs.items():
            self[k] = v

        if url is not None:
            self.fetch(url)

        if html is not None:
            self.parser(html)

    def __setattr__(self, name, val):
        # Attribute assignment writes through to the underlying dict.
        self[name] = val

    def __getattr__(self, name):
        # Only called when normal lookup fails; translate a missing key into
        # the AttributeError that callers such as parser() expect.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def fetch(self, url):
        """
        Download *url* and parse its Open Graph metadata.
        """
        with urllib.request.urlopen(url) as raw:
            html = raw.read()
        return self.parser(html)

    def parser(self, html):
        """
        Parse *html* (a string/bytes or an existing BeautifulSoup document)
        and populate self with its og:* properties.
        """
        if not isinstance(html, BeautifulSoup):
            doc = BeautifulSoup(html, "lxml")
        else:
            doc = html

        ogs = doc.html.head.findAll(property=re.compile(r'^og'))
        for og in ogs:
            if og.has_attr('content'):
                # Strip the "og:" prefix, e.g. "og:title" -> "title".
                self[og['property'][3:]] = og['content']

        # Couldn't fetch all attrs from og tags, try scraping the body.
        if not self.is_valid() and self.scrape:
            for attr in self.required_attrs:
                if not self.valid_attr(attr):
                    try:
                        self[attr] = getattr(self, 'scrape_%s' % attr)(doc)
                    except AttributeError:
                        pass

    def valid_attr(self, attr):
        return self.get(attr) and len(self[attr]) > 0

    def is_valid(self):
        return all(self.valid_attr(attr) for attr in self.required_attrs)

    def to_html(self):
        if not self.is_valid():
            return "<meta property=\"og:error\" content=\"og metadata is not valid\" />"

        meta = ""
        for key, value in self.items():
            meta += "\n<meta property=\"og:%s\" content=\"%s\" />" % (key, value)
        meta += "\n"

        return meta

    def to_json(self):
        # TODO: force unicode
        if not import_json:
            return '{"error": "json module is not available"}'

        if not self.is_valid():
            return json.dumps({'error': 'og metadata is not valid'})

        return json.dumps(self)

    def to_xml(self):
        # Not implemented.
        pass

    def scrape_image(self, doc):
        # First <img> in the body that actually carries a src attribute.
        images = [img['src']
                  for img in doc.html.body.findAll('img')
                  if img.get('src')]
        if images:
            return images[0]
        return ''

    def scrape_title(self, doc):
        return doc.html.head.title.text

    def scrape_type(self, doc):
        return 'other'

    def scrape_url(self, doc):
        return self._url

    def scrape_description(self, doc):
        tags = doc.html.head.findAll('meta', attrs={"name": "description"})
        result = "".join([t.get('content', '') for t in tags])
        return result
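

# Usage sketch: a minimal example of how the OpenGraph class above can be
# driven; the URL and inline HTML below are placeholders, not real data.
if __name__ == '__main__':
    # Fetch a live page and print its metadata as JSON if it is complete.
    page = OpenGraph(url='https://example.com/')  # placeholder URL
    if page.is_valid():
        print(page.to_json())

    # Or parse HTML that was downloaded elsewhere, letting scrape=True fill
    # any missing required attributes from the document body.
    sample = ('<html><head><title>Example</title>'
              '<meta property="og:title" content="Example"/>'
              '</head><body></body></html>')
    local = OpenGraph(html=sample, scrape=True)
    print(local.get('title'), local.get('type'))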