Merge pull request #625 from Tecnativa/9.0-mig-html_text

[9.0] [MIG] html_text module
8 years ago · 75143b41c3
8 changed files with 253 additions and 0 deletions
--- a/html_text/README.rst
+++ b/html_text/README.rst
@ -0,0 +1,79 @@
+.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg
+   :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
+   :alt: License: AGPL-3
+
+====================
+Text from HTML field
+====================
+
+This module provides some technical features that allow extracting text from
+any chunk of HTML, without HTML tags or attributes. You can choose either:
+
+* To truncate the result by amount of words or characters.
+* To append an ellipsis (or any character(s)) at the end of the result.
+
+It can be used to easily generate excerpts.
+
+Usage
+=====
+
+This module just adds a technical utility, but nothing for the end user.
+
+If you are a developer and need this utility for your module, see these
+examples and read the docs inside the code.
+
+Python example::
+
+    @api.multi
+    def some_method(self):
+        # Get truncated text from an HTML field. It will have 40 words and 100
+        # characters at most, and will have "..." appended at the end if it
+        # gets truncated.
+        truncated_text = self.env["ir.fields.converter"].text_from_html(
+            self.html_field, 40, 100, "...")
+
+QWeb example::
+
+    <t t-esc="env['ir.fields.converter'].text_from_html(doc.html_field)"/>
+
+.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
+   :alt: Try me on Runbot
+   :target: https://runbot.odoo-community.org/runbot/149/9.0
+
+Known issues / Roadmap
+======================
+
+* An option could be added to try to respect the basic HTML tags inside the
+  excerpt (``<b>``, ``<i>``, ``<p>``, etc.).
+
+Bug Tracker
+===========
+
+Bugs are tracked on `GitHub Issues
+<https://github.com/OCA/server-tools/issues>`_. In case of trouble, please
+check there if your issue has already been reported. If you spotted it first,
+help us smash it by providing detailed and welcome feedback.
+
+Credits
+=======
+
+Contributors
+------------
+
+* Jairo Llopis <yajo.sk8@gmail.com>
+* Vicent Cubells <vicent.cubells@tecnativa.com>
+
+Maintainer
+----------
+
+.. image:: https://odoo-community.org/logo.png
+   :alt: Odoo Community Association
+   :target: https://odoo-community.org
+
+This module is maintained by the OCA.
+
+OCA, or the Odoo Community Association, is a nonprofit organization whose
+mission is to support the collaborative development of Odoo features and
+promote its widespread use.
+
+To contribute to this module, please visit https://odoo-community.org.
--- a/html_text/__init__.py
+++ b/html_text/__init__.py
@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# Copyright 2016 Tecnativa - Vicent Cubells
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from . import models
--- a/html_text/__openerp__.py
+++ b/html_text/__openerp__.py
@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# Copyright 2016 Tecnativa - Vicent Cubells
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+{  # Addon manifest: a single dict literal describing the html_text module.
+    "name": "Text from HTML field",
+    "summary": "Generate excerpts from any HTML field",
+    "version": "9.0.1.0.0",
+    "category": "Tools",
+    "website": "https://tecnativa.com",
+    "author": "Grupo ESOC Ingeniería de Servicios, "
+              "Tecnativa, "
+              "Odoo Community Association (OCA)",  # one string, split for line length
+    "license": "AGPL-3",
+    "application": False,
+    "installable": True,
+    "external_dependencies": {
+        "python": [
+            "lxml.html",  # models/ir_fields_converter.py imports html from lxml
+        ],
+    },
+    "depends": [
+        "base",  # presumably provides the inherited ir.fields.converter model; confirm
+    ],
+}
--- a/html_text/models/__init__.py
+++ b/html_text/models/__init__.py
@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# Copyright 2016 Tecnativa - Vicent Cubells
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from . import ir_fields_converter
--- a/html_text/models/ir_fields_converter.py
+++ b/html_text/models/ir_fields_converter.py
@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# Copyright 2016 Tecnativa - Vicent Cubells
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+import logging
+from lxml import etree, html
+from openerp import api, models
+
+_logger = logging.getLogger(__name__)
+
+
+class IrFieldsConverter(models.Model):
+    _inherit = "ir.fields.converter"
+
+    @api.model
+    def text_from_html(self, html_content, max_words=None, max_chars=None,
+                       ellipsis=u"…", fail=False):
+        """Extract text from an HTML field in a generator.
+
+        :param str html_content:
+            HTML contents from where to extract the text.
+
+        :param int max_words:
+            Maximum amount of words allowed in the resulting string.
+
+        :param int max_chars:
+            Maximum amount of characters allowed in the resulting string. If
+            you apply this limit, beware that the last word could get cut in an
+            unexpected place.
+
+        :param str ellipsis:
+            Character(s) to be appended to the end of the resulting string if
+            it gets truncated after applying limits set in :param:`max_words`
+            or :param:`max_chars`. If you want nothing applied, just set an
+            empty string.
+
+        :param bool fail:
+            If ``True``, exceptions will be raised. Otherwise, an empty string
+            will be returned on failure.
+        """
+        # Parse HTML
+        try:
+            doc = html.fromstring(html_content)
+        except (TypeError, etree.XMLSyntaxError, etree.ParserError):
+            if fail:
+                raise
+            else:
+                _logger.exception("Failure parsing this HTML:\n%s",
+                                  html_content)
+                return ""
+
+        # Get words
+        words = u"".join(doc.xpath("//text()")).split()
+
+        # Truncate words
+        suffix = max_words and len(words) > max_words
+        if max_words:
+            words = words[:max_words]
+
+        # Get text
+        text = u" ".join(words)
+
+        # Truncate text
+        suffix = suffix or max_chars and len(text) > max_chars
+        if max_chars:
+            text = text[:max_chars - (len(ellipsis) if suffix else 0)].strip()
+
+        # Append ellipsis if needed
+        if suffix:
+            text += ellipsis
+
+        return text
--- a/html_text/static/description/icon.png
+++ b/html_text/static/description/icon.png
--- a/html_text/tests/__init__.py
+++ b/html_text/tests/__init__.py
@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from . import test_extractor
--- a/html_text/tests/test_extractor.py
+++ b/html_text/tests/test_extractor.py
@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from lxml import etree
+from openerp.tests.common import TransactionCase
+
+
+class ExtractorCase(TransactionCase):
+    def setUp(self):
+        super(ExtractorCase, self).setUp()
+
+        # Shortcut
+        self.text_from_html = self.env["ir.fields.converter"].text_from_html
+
+    def test_excerpts(self):
+        """Text gets correctly extracted."""
+        html = u"""
+            <html>
+                <body>
+                    <div class="this should not appear">
+                        <h1>I'm a title</h1>
+                        <p>I'm a paragraph</p>
+                        <small>¡Pues yo soy español!</small>
+                    </div>
+                </body>
+            </html>
+            """
+
+        self.assertEqual(
+            self.text_from_html(html),
+            u"I'm a title I'm a paragraph ¡Pues yo soy español!")
+        self.assertEqual(
+            self.text_from_html(html, 8),
+            u"I'm a title I'm a paragraph ¡Pues yo…")
+        self.assertEqual(
+            self.text_from_html(html, 8, 31),
+            u"I'm a title I'm a paragraph ¡P…")
+        self.assertEqual(
+            self.text_from_html(html, 7, ellipsis=""),
+            u"I'm a title I'm a paragraph ¡Pues")
+
+    def test_empty_html(self):
+        """Empty HTML handled correctly."""
+        self.assertEqual(self.text_from_html(""), "")
+        with self.assertRaises(etree.XMLSyntaxError):
+            self.text_from_html("", fail=True)
+
+    def test_false_html(self):
+        """``False`` HTML handled correctly."""
+        self.assertEqual(self.text_from_html(False), "")
+        with self.assertRaises(TypeError):
+            self.text_from_html(False, fail=True)
+
+    def test_bad_html(self):
+        """Bad HTML handled correctly."""
+        self.assertEqual(self.text_from_html("<<bad>"), "")
+        with self.assertRaises(etree.ParserError):
+            self.text_from_html("<<bad>", fail=True)