Refactored class ``OriginalDocument``

11 years ago · c9e8e00b92
parent eb8a8c5248
commit c9e8e00b92
2 changed files with 31 additions and 18 deletions
--- a/readability/_py3k.py
+++ b/readability/_py3k.py
@@ -24,6 +24,20 @@ except ImportError:
    import urllib.request as urllib


+def unicode_compatible(cls):
+    """
+    Decorator for unicode compatible classes. Method ``__unicode__``
+    has to be implemented for the decorator to work as expected.
+    """
+    if PY3:
+        cls.__str__ = cls.__unicode__
+        cls.__bytes__ = lambda self: self.__str__().encode("utf8")
+    else:
+        cls.__str__ = lambda self: self.__unicode__().encode("utf8")
+
+    return cls
+
+
 def to_string(object):
    return to_unicode(object) if PY3 else to_bytes(object)

--- a/readability/document.py
+++ b/readability/document.py
@@ -11,7 +11,7 @@ import charade
 from lxml.etree import tostring, tounicode, XMLSyntaxError
 from lxml.html import document_fromstring, HTMLParser

-from ._py3k import unicode, to_string, to_bytes, to_unicode
+from ._py3k import unicode, to_bytes, to_unicode, unicode_compatible
 from .utils import cached_property


@@ -72,45 +72,44 @@ def build_document(html_content, base_href=None):
    return document


+@unicode_compatible
 class OriginalDocument(object):
-    """The original document to process"""
+    """The original document to process."""

    def __init__(self, html, url=None):
-        self.orig_html = html
-        self.url = url
+        self._html = html
+        self._url = url

-    def __str__(self):
-        """Render out our document as a string"""
-        return to_string(tostring(self.html))
+    @property
+    def url(self):
+        """Source URL of HTML document."""
+        return self._url

    def __unicode__(self):
-        """Render out our document as a string"""
+        """Renders the document as a string."""
        return tounicode(self.html)

-    def _parse(self, html):
-        """Generate an lxml document from html."""
+    @cached_property
+    def html(self):
+        """Parsed HTML document from the input."""
+        html = self._html
        if not isinstance(html, unicode):
            encoding = determine_encoding(html)
            html = html.decode(encoding)

        html = replace_multi_br_to_paragraphs(html)
-        document = build_document(html, self.url)
+        document = build_document(html, self._url)

        return document

-    @cached_property
-    def html(self):
-        """The parsed html document from the input"""
-        return self._parse(self.orig_html)
-
    @cached_property
    def links(self):
-        """Links within the document"""
+        """Links within the document."""
        return self.html.findall(".//a")

    @cached_property
    def title(self):
-        """Pull the title attribute out of the parsed document"""
+        """Title attribute of the parsed document."""
        title_element = self.html.find(".//title")
        if title_element is None or title_element.text is None:
            return ""