[8.0][html_image_url_extractor] Image extractor from HTML fields. (#354)

* [8.0][html_image_url_extractor] Image extractor from HTML fields. This technical utility allows the developer to get a list of image URLs from any piece of HTML. You can use it, for example, to get the cover image from a blog post (upcoming module), or to create a slider with all images from it.
9 years ago · e518a89260
8 changed files with 259 additions and 0 deletions
--- a/html_image_url_extractor/README.rst
+++ b/html_image_url_extractor/README.rst
@ -0,0 +1,81 @@
+.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg
+   :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
+   :alt: License: AGPL-3
+
+==========================
+Image URLs from HTML field
+==========================
+
+This module includes a method that extracts image URLs from any chunk of HTML,
+in order of appearance.
+
+Usage
+=====
+
+This module just adds a technical utility, but nothing for the end user.
+
+If you are a developer and need this utility for your module, see these
+examples and read the docs inside the code.
+
+Python example::
+
+    @api.multi
+    def some_method(self):
+        # Get images from an HTML field
+        imgs = self.env["ir.fields.converter"].imgs_from_html(self.html_field)
+        for url in imgs:
+            # Do stuff with those URLs
+            pass
+
+QWeb example::
+
+    <!-- Extract first image from a blog post -->
+    <t t-foreach="env['ir.fields.converter']
+                  .imgs_from_html(blog_post.content, 1)"
+       t-as="url">
+        <img t-att-src="url"/>
+    </t>
+
+.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
+   :alt: Try me on Runbot
+   :target: https://runbot.odoo-community.org/runbot/149/8.0
+
+Known issues / Roadmap
+======================
+
+* The regexp to find the URL could be better.
+
+Bug Tracker
+===========
+
+Bugs are tracked on `GitHub Issues
+<https://github.com/OCA/server-tools/issues>`_. In case of trouble, please
+check there if your issue has already been reported. If you spotted it first,
+help us smash it by providing detailed and welcome `feedback
+<https://github.com/OCA/
+server-tools/issues/new?body=module:%20
+html_image_url_extractor%0Aversion:%20
+8.0%0A%0A**Steps%20to%20reproduce**%0A-%20...%0A%0A**Current%20behavior**%0A%0A**Expected%20behavior**>`_.
+
+Credits
+=======
+
+Contributors
+------------
+
+* Jairo Llopis <yajo.sk8@gmail.com>
+
+Maintainer
+----------
+
+.. image:: https://odoo-community.org/logo.png
+   :alt: Odoo Community Association
+   :target: https://odoo-community.org
+
+This module is maintained by the OCA.
+
+OCA, or the Odoo Community Association, is a nonprofit organization whose
+mission is to support the collaborative development of Odoo features and
+promote its widespread use.
+
+To contribute to this module, please visit https://odoo-community.org.
--- a/html_image_url_extractor/__init__.py
+++ b/html_image_url_extractor/__init__.py
@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from . import models
--- a/html_image_url_extractor/__openerp__.py
+++ b/html_image_url_extractor/__openerp__.py
@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+# Addon manifest: descriptive metadata (name, summary, version, authorship,
+# license) plus what the addon needs in order to work.
+{
+    "name": "Image URLs from HTML field",
+    "summary": "Extract images found in any HTML field",
+    "version": "8.0.1.0.0",
+    "category": "Tools",
+    "website": "https://grupoesoc.es",
+    "author": "Grupo ESOC Ingeniería de Servicios, "
+              "Odoo Community Association (OCA)",
+    "license": "AGPL-3",
+    "application": False,
+    "installable": True,
+    # ``lxml.html`` is what the model file uses to parse the HTML.
+    "external_dependencies": {
+        "python": [
+            "lxml.html",
+        ],
+    },
+    # The model extended by this addon is ``ir.fields.converter``
+    # (presumably provided by ``base`` -- confirm).
+    "depends": [
+        "base",
+    ],
+}
--- a/html_image_url_extractor/models/__init__.py
+++ b/html_image_url_extractor/models/__init__.py
@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from . import ir_fields_converter
--- a/html_image_url_extractor/models/ir_fields_converter.py
+++ b/html_image_url_extractor/models/ir_fields_converter.py
@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+import re
+import logging
+from lxml import etree, html
+from openerp import api, models
+
+_logger = logging.getLogger(__name__)
+
+
+# Extends the existing ``ir.fields.converter`` model (through ``_inherit``)
+# with a helper that lists the image URLs referenced by a chunk of HTML.
+class IrFieldsConverter(models.Model):
+    _inherit = "ir.fields.converter"
+
+    @api.model
+    def imgs_from_html(self, html_content, limit=None, fail=False):
+        """Yield the image URLs found in a chunk of HTML, in order.
+
+        URLs are taken from the ``src`` attribute of ``<img>`` elements and
+        from the ``url(...)`` value of ``background`` / ``background-image``
+        rules found in inline ``style`` attributes.
+
+        This method is a generator: the HTML is not parsed, and no exception
+        can be raised, until the result is iterated.
+
+        :param str html_content:
+            HTML contents from where to extract the images.
+
+        :param int limit:
+            Stop after this number of matching elements has been examined.
+            A falsy value (``None``, ``0``) disables the limit.
+
+        :param bool fail:
+            If ``True``, exceptions raised while parsing the HTML
+            (``TypeError``, ``etree.XMLSyntaxError``, ``etree.ParserError``)
+            are re-raised. Otherwise they are logged and nothing is yielded.
+
+        :return:
+            Generator of URL strings.
+        """
+        # Parse HTML
+        try:
+            doc = html.fromstring(html_content)
+        except (TypeError, etree.XMLSyntaxError, etree.ParserError):
+            if fail:
+                raise
+            else:
+                # Best-effort mode: log the traceback together with the
+                # offending content and end the generator without results.
+                _logger.exception("Failure parsing this HTML:\n%s",
+                                  html_content)
+                return
+
+        # Required tools
+        # XPath union of: every <img> that has a ``src``, and every element
+        # whose ``style`` mentions both "background" and "url(". The
+        # ``translate()`` calls lower-case just the letters of those words so
+        # that the ``contains()`` checks are case-insensitive.
+        query = """
+            //img[@src] |
+            //*[contains(translate(@style, "BACKGROUND", "background"),
+                         'background')]
+               [contains(translate(@style, "URL", "url"), 'url(')]
+        """
+        # Verbose-mode pattern: the whitespace and ``#`` comments inside the
+        # literal are ignored once compiled with ``re.VERBOSE``.
+        # NOTE(review): ``[^)]*`` is greedy, so whitespace placed before the
+        # closing parenthesis ends up inside the ``url`` group; the later
+        # ``strip("\"'")`` then cannot remove a trailing quote that is
+        # followed by such whitespace.
+        rgx = r"""
+            url\(\s*        # Start function
+            (?P<url>[^)]*)  # URL string
+            \s*\)           # End function
+        """
+        rgx = re.compile(rgx, re.IGNORECASE | re.VERBOSE)
+
+        # Loop through possible image URLs
+        # NOTE(review): ``lap`` counts matched elements, not yielded URLs. A
+        # styled element may yield zero or several URLs, so ``limit`` is not
+        # an exact cap on the number of results.
+        for lap, element in enumerate(doc.xpath(query)):
+            if limit and lap >= limit:
+                break
+            if element.tag == "img":
+                yield element.attrib["src"]
+            else:
+                # Inspect each ``property: value`` rule of the inline style.
+                for rule in element.attrib["style"].split(";"):
+                    # Extract background image
+                    parts = rule.split(":", 1)
+                    try:
+                        if parts[0].strip().lower() in {"background",
+                                                        "background-image"}:
+                            # Surrounding quote characters are stripped from
+                            # the captured value.
+                            yield (rgx.search(parts[1])
+                                   .group("url").strip("\"'"))
+                    # Malformed CSS or no match for URL
+                    # IndexError: the rule has no ":" so ``parts[1]`` is
+                    # missing. AttributeError: ``rgx.search`` returned None.
+                    except (IndexError, AttributeError):
+                        pass
--- a/html_image_url_extractor/static/description/icon.png
+++ b/html_image_url_extractor/static/description/icon.png
--- a/html_image_url_extractor/tests/__init__.py
+++ b/html_image_url_extractor/tests/__init__.py
@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from . import test_extractor
--- a/html_image_url_extractor/tests/test_extractor.py
+++ b/html_image_url_extractor/tests/test_extractor.py
@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from lxml import etree
+from openerp.tests.common import TransactionCase
+
+
+# Tests for ``imgs_from_html`` on the ``ir.fields.converter`` model: one
+# happy-path case mixing <img> tags with CSS backgrounds, and three cases
+# covering unparsable input with and without ``fail=True``.
+class ExtractorCase(TransactionCase):
+    def setUp(self):
+        super(ExtractorCase, self).setUp()
+
+        # Shortcut
+        self.imgs_from_html = self.env["ir.fields.converter"].imgs_from_html
+
+    def test_mixed_images_found(self):
+        """Images correctly found in <img> elements and backgrounds."""
+        # The fixture numbers its URLs /path/0 .. /path/7 in the order they
+        # are expected to be yielded. It also contains noise that must not
+        # produce results: an <img> without ``src``, an empty style rule, a
+        # ``background`` rule without a value and a non-ASCII rule.
+        content = u"""
+            <div>
+                <!-- src-less img -->
+                <img/>
+                <p/>
+                <img src="/path/0"/>
+                <img src="/path/1"/>
+                <img src="/path/2"/>
+                <img src="/path/3"/>
+                <section style="background : URL('/path/4');;background;ö;">
+                    <div style='BACKGROUND-IMAGE:url(/path/5)'>
+                        <p style="background:uRl(&quot;/path/6&quot;)">
+                            <img src="/path/7"/>
+                        </p>
+                    </div>
+                </section>
+            </div>
+            """
+
+        # Read all images
+        for n, url in enumerate(self.imgs_from_html(content)):
+            self.assertEqual("/path/%d" % n, url)
+        # The last index seen must be 7, i.e. exactly eight URLs were yielded.
+        self.assertEqual(n, 7)
+
+        # Read only first image
+        for n, url in enumerate(self.imgs_from_html(content, 1)):
+            self.assertEqual("/path/%d" % n, url)
+        # With ``limit=1`` the loop must have run exactly once.
+        self.assertEqual(n, 0)
+
+    def test_empty_html(self):
+        """Empty HTML handled correctly."""
+        # Without ``fail`` the generator must simply be empty.
+        for laps, text in self.imgs_from_html(""):
+            self.assertTrue(False)  # You should never get here
+
+        # ``list()`` forces the generator to run so the error can surface.
+        with self.assertRaises(etree.XMLSyntaxError):
+            list(self.imgs_from_html("", fail=True))
+
+    def test_false_html(self):
+        """``False`` HTML handled correctly."""
+        # Without ``fail`` the generator must simply be empty.
+        for laps, text in self.imgs_from_html(False):
+            self.assertTrue(False)  # You should never get here
+
+        # ``list()`` forces the generator to run so the error can surface.
+        with self.assertRaises(TypeError):
+            list(self.imgs_from_html(False, fail=True))
+
+    def test_bad_html(self):
+        """Bad HTML handled correctly."""
+        # Without ``fail`` the generator must simply be empty.
+        for laps, text in self.imgs_from_html("<<bad>"):
+            self.assertTrue(False)  # You should never get here
+
+        # ``list()`` forces the generator to run so the error can surface.
+        with self.assertRaises(etree.ParserError):
+            list(self.imgs_from_html("<<bad>", fail=True))