diff --git a/html_image_url_extractor/README.rst b/html_image_url_extractor/README.rst new file mode 100644 index 000000000..2f4ea18a6 --- /dev/null +++ b/html_image_url_extractor/README.rst @@ -0,0 +1,81 @@ +.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg + :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html + :alt: License: AGPL-3 + +========================== +Image URLs from HTML field +========================== + +This module includes a method that extracts image URLs from any chunk of HTML, +in order of appearance. + +Usage +===== + +This module just adds a technical utility, but nothing for the end user. + +If you are a developer and need this utility for your module, see these +examples and read the docs inside the code. + +Python example:: + + @api.multi + def some_method(self): + # Get images from an HTML field + imgs = self.env["ir.fields.converter"].imgs_from_html(self.html_field) + for url in imgs: + # Do stuff with those URLs + pass + +QWeb example:: + + <!-- Extract the first image from a blog post --> + <t t-foreach="blog_post.env['ir.fields.converter'].imgs_from_html(blog_post.content, 1)" t-as="url"> + <img t-att-src="url"/> + </t> + +.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas + :alt: Try me on Runbot + :target: https://runbot.odoo-community.org/runbot/149/8.0 + +Known issues / Roadmap +====================== + +* The regexp to find the URL could be better. + +Bug Tracker +=========== + +Bugs are tracked on `GitHub Issues +<https://github.com/OCA/server-tools/issues>`_. In case of trouble, please +check there if your issue has already been reported. If you spotted it first, +help us smash it by providing a detailed and welcomed `feedback +<https://github.com/OCA/server-tools/issues/new>`_. + +Credits +======= + +Contributors +------------ + +* Jairo Llopis + +Maintainer +---------- + +.. image:: https://odoo-community.org/logo.png + :alt: Odoo Community Association + :target: https://odoo-community.org + +This module is maintained by the OCA. + +OCA, or the Odoo Community Association, is a nonprofit organization whose +mission is to support the collaborative development of Odoo features and +promote its widespread use. 
# -*- coding: utf-8 -*-
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).

# Addon manifest for ``html_image_url_extractor``: a single dict literal that
# describes the module. It contains data only, no executable logic.
{
    # Human-readable title and one-line description of the addon.
    "name": "Image URLs from HTML field",
    "summary": "Extract images found in any HTML field",
    # NOTE(review): presumably <Odoo series>.<addon version> (series 8.0,
    # addon 1.0.0) -- confirm against the project's versioning convention.
    "version": "8.0.1.0.0",
    "category": "Tools",
    "website": "https://grupoesoc.es",
    # Two adjacent string literals: Python concatenates them into one value.
    "author": "Grupo ESOC Ingeniería de Servicios, "
              "Odoo Community Association (OCA)",
    "license": "AGPL-3",
    # A technical utility, not a standalone application.
    "application": False,
    "installable": True,
    # Third-party Python package required by the addon; the model code
    # imports ``etree`` and ``html`` from ``lxml``.
    "external_dependencies": {
        "python": [
            "lxml.html",
        ],
    },
    # Other addons this one depends on.
    "depends": [
        "base",
    ],
}
# -*- coding: utf-8 -*-
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).

import re
import logging
from lxml import etree, html
from openerp import api, models

_logger = logging.getLogger(__name__)

# Matches a CSS ``url(...)`` function and captures its raw argument, which may
# still carry surrounding quotes and whitespace.
_CSS_URL_RE = re.compile(
    r"""
    url\(\s*            # Start function
    (?P<url>[^)]*)      # URL string
    \s*\)               # End function
    """,
    re.IGNORECASE | re.VERBOSE)

# CSS properties whose ``url(...)`` values are considered images.
_BACKGROUND_RULES = frozenset(("background", "background-image"))

# Elements that may carry an image: ``<img src>`` and anything whose inline
# style mentions both "background" and "url(" in any letter case (XPath 1.0
# has no ``lower-case()``, hence ``translate()``).
_IMG_QUERY = """
    //img[@src] |
    //*[contains(translate(@style, "BACKGROUND", "background"),
                 'background')]
       [contains(translate(@style, "URL", "url"), 'url(')]
"""


def _background_urls(style):
    """Yield the background image URLs declared in an inline CSS style.

    :param str style:
        Contents of a ``style`` attribute.

    Rules are split on ``;``, so a ``url(...)`` whose argument itself contains
    a semicolon (e.g. a ``data:`` URI) is cut in two and silently skipped.
    """
    for rule in style.split(";"):
        name, _sep, value = rule.partition(":")
        if name.strip().lower() not in _BACKGROUND_RULES:
            continue
        # A rule may declare several comma-separated background layers, each
        # with its own ``url(...)``; malformed rules simply yield no match.
        for match in _CSS_URL_RE.finditer(value):
            # Drop inner padding first, then the optional CSS quotes.
            yield match.group("url").strip().strip("\"'")


class IrFieldsConverter(models.Model):
    _inherit = "ir.fields.converter"

    @api.model
    def imgs_from_html(self, html_content, limit=None, fail=False):
        """Extract all images in order from an HTML field in a generator.

        URLs are taken from the ``src`` attribute of ``<img>`` elements and
        from the ``url(...)`` values of ``background`` and
        ``background-image`` rules in inline ``style`` attributes.

        :param str html_content:
            HTML contents from where to extract the images.

        :param int limit:
            Only get up to this number of image URLs. A falsy value (``None``
            or ``0``) means no limit.

        :param bool fail:
            If ``True``, parsing exceptions will be raised. Otherwise they
            are logged and no URL is produced.

        :return:
            Generator of URL strings, in document order.

        :raise TypeError, ValueError, lxml.etree.XMLSyntaxError,
               lxml.etree.ParserError:
            Only if ``fail`` is ``True`` and ``html_content`` cannot be
            parsed. As this is a generator, the exception is raised when
            iteration starts, not when the method is called.
        """
        # Parse HTML
        try:
            doc = html.fromstring(html_content)
        # ValueError: lxml refuses unicode input with an encoding declaration
        except (TypeError, ValueError,
                etree.XMLSyntaxError, etree.ParserError):
            if fail:
                raise
            _logger.exception("Failure parsing this HTML:\n%s", html_content)
            return

        # Loop through possible image URLs
        found = 0
        for element in doc.xpath(_IMG_QUERY):
            if element.tag == "img":
                urls = [element.attrib["src"]]
            else:
                urls = _background_urls(element.attrib["style"])
            for url in urls:
                # Count URLs actually produced, not elements visited: a
                # styled element may provide none or several.
                if limit and found >= limit:
                    return
                found += 1
                yield url
# -*- coding: utf-8 -*-
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).

from lxml import etree
from openerp.tests.common import TransactionCase


class ExtractorCase(TransactionCase):
    """Tests for ``ir.fields.converter``'s ``imgs_from_html`` generator."""

    # Which lxml exception is raised for empty or broken markup depends on the
    # installed lxml/libxml2 version, so any parsing error is accepted.
    PARSE_ERRORS = (etree.XMLSyntaxError, etree.ParserError)

    def setUp(self):
        super(ExtractorCase, self).setUp()

        # Shortcut
        self.imgs_from_html = self.env["ir.fields.converter"].imgs_from_html

    def test_mixed_images_found(self):
        """Images correctly found in <img> elements and backgrounds."""
        # Eight images, numbered in document order. The fixture mixes plain
        # <img> tags, an <img> without src (ignored), odd letter case, all
        # quoting styles and malformed CSS rules (ignored).
        content = u"""
            <div>
                <!-- src-less img -->
                <img/>
                <img src="/path/0"/>
                <img src="/path/1"/>
                <img src="/path/2"/>
                <img src="/path/3"/>
                <section style="background : URL('/path/4');;background;">
                    <div style='BACKGROUND-IMAGE:url(/path/5)'>
                        <p style="background:uRl(&quot;/path/6&quot;)">
                            <img src="/path/7"/>
                        </p>
                    </div>
                </section>
            </div>
            """
        expected = ["/path/%d" % n for n in range(8)]

        # Read all images
        self.assertEqual(list(self.imgs_from_html(content)), expected)

        # Read only first image
        self.assertEqual(list(self.imgs_from_html(content, 1)), expected[:1])

        # A limit that reaches into the background images
        self.assertEqual(list(self.imgs_from_html(content, 5)), expected[:5])

    def test_empty_html(self):
        """Empty HTML handled correctly."""
        self.assertEqual(list(self.imgs_from_html("")), [])

        with self.assertRaises(self.PARSE_ERRORS):
            list(self.imgs_from_html("", fail=True))

    def test_false_html(self):
        """``False`` HTML handled correctly."""
        self.assertEqual(list(self.imgs_from_html(False)), [])

        with self.assertRaises(TypeError):
            list(self.imgs_from_html(False, fail=True))

    def test_bad_html(self):
        """Bad HTML handled correctly."""
        self.assertEqual(list(self.imgs_from_html("<")), [])

        with self.assertRaises(self.PARSE_ERRORS):
            list(self.imgs_from_html("<", fail=True))