Browse Source
[8.0][html_image_url_extractor] Image extractor from HTML fields. (#354)
[8.0][html_image_url_extractor] Image extractor from HTML fields. (#354)
* [8.0][html_image_url_extractor] Image extractor from HTML fields. This technical utility allows the developer to get a list of image URLs from any piece of HTML. You can use it, for example, to get the cover image from a blog post (upcoming module), or to create a slider with all images from it. pull/418/merge
Yajo
9 years ago
committed by
Pedro M. Baeza
8 changed files with 259 additions and 0 deletions
-
81html_image_url_extractor/README.rst
-
5html_image_url_extractor/__init__.py
-
23html_image_url_extractor/__openerp__.py
-
5html_image_url_extractor/models/__init__.py
-
71html_image_url_extractor/models/ir_fields_converter.py
-
BINhtml_image_url_extractor/static/description/icon.png
-
5html_image_url_extractor/tests/__init__.py
-
69html_image_url_extractor/tests/test_extractor.py
@ -0,0 +1,81 @@ |
|||||
|
.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg |
||||
|
:target: http://www.gnu.org/licenses/agpl-3.0-standalone.html |
||||
|
:alt: License: AGPL-3 |
||||
|
|
||||
|
========================== |
||||
|
Image URLs from HTML field |
||||
|
========================== |
||||
|
|
||||
|
This module includes a method that extracts image URLs from any chunk of HTML, |
||||
|
in order of appearance. |
||||
|
|
||||
|
Usage |
||||
|
===== |
||||
|
|
||||
|
This module just adds a technical utility, but nothing for the end user. |
||||
|
|
||||
|
If you are a developer and need this utility for your module, see these |
||||
|
examples and read the docs inside the code. |
||||
|
|
||||
|
Python example:: |
||||
|
|
||||
|
@api.multi |
||||
|
def some_method(self): |
||||
|
# Get images from an HTML field |
||||
|
imgs = self.env["ir.fields.converter"].imgs_from_html(self.html_field) |
||||
|
for url in imgs: |
||||
|
# Do stuff with those URLs |
||||
|
pass |
||||
|
|
||||
|
QWeb example:: |
||||
|
|
||||
|
<!-- Extract first image from a blog post --> |
||||
|
<t t-foreach="env['ir.fields.converter'] |
||||
|
.imgs_from_html(blog_post.content, 1)" |
||||
|
t-as="url"> |
||||
|
<img t-att-src="url"/> |
||||
|
</t> |
||||
|
|
||||
|
.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas |
||||
|
:alt: Try me on Runbot |
||||
|
:target: https://runbot.odoo-community.org/runbot/149/8.0 |
||||
|
|
||||
|
Known issues / Roadmap |
||||
|
====================== |
||||
|
|
||||
|
* The regexp to find the URL could be better. |
||||
|
|
||||
|
Bug Tracker |
||||
|
=========== |
||||
|
|
||||
|
Bugs are tracked on `GitHub Issues |
||||
|
<https://github.com/OCA/server-tools/issues>`_. In case of trouble, please |
||||
|
check there if your issue has already been reported. If you spotted it first, |
||||
|
help us smash it by providing detailed and welcomed `feedback |
||||
|
<https://github.com/OCA/ |
||||
|
server-tools/issues/new?body=module:%20 |
||||
|
html_image_url_extractor%0Aversion:%20 |
||||
|
8.0%0A%0A**Steps%20to%20reproduce**%0A-%20...%0A%0A**Current%20behavior**%0A%0A**Expected%20behavior**>`_. |
||||
|
|
||||
|
Credits |
||||
|
======= |
||||
|
|
||||
|
Contributors |
||||
|
------------ |
||||
|
|
||||
|
* Jairo Llopis <yajo.sk8@gmail.com> |
||||
|
|
||||
|
Maintainer |
||||
|
---------- |
||||
|
|
||||
|
.. image:: https://odoo-community.org/logo.png |
||||
|
:alt: Odoo Community Association |
||||
|
:target: https://odoo-community.org |
||||
|
|
||||
|
This module is maintained by the OCA. |
||||
|
|
||||
|
OCA, or the Odoo Community Association, is a nonprofit organization whose |
||||
|
mission is to support the collaborative development of Odoo features and |
||||
|
promote its widespread use. |
||||
|
|
||||
|
To contribute to this module, please visit https://odoo-community.org. |
@ -0,0 +1,5 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis |
||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). |
||||
|
|
||||
|
from . import models |
@ -0,0 +1,23 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis |
||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). |
||||
|
{ |
||||
|
"name": "Image URLs from HTML field", |
||||
|
"summary": "Extract images found in any HTML field", |
||||
|
"version": "8.0.1.0.0", |
||||
|
"category": "Tools", |
||||
|
"website": "https://grupoesoc.es", |
||||
|
"author": "Grupo ESOC Ingeniería de Servicios, " |
||||
|
"Odoo Community Association (OCA)", |
||||
|
"license": "AGPL-3", |
||||
|
"application": False, |
||||
|
"installable": True, |
||||
|
"external_dependencies": { |
||||
|
"python": [ |
||||
|
"lxml.html", |
||||
|
], |
||||
|
}, |
||||
|
"depends": [ |
||||
|
"base", |
||||
|
], |
||||
|
} |
@ -0,0 +1,5 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis |
||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). |
||||
|
|
||||
|
from . import ir_fields_converter |
@ -0,0 +1,71 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis |
||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). |
||||
|
|
||||
|
import re |
||||
|
import logging |
||||
|
from lxml import etree, html |
||||
|
from openerp import api, models |
||||
|
|
||||
|
_logger = logging.getLogger(__name__) |
||||
|
|
||||
|
|
||||
|
class IrFieldsConverter(models.Model):
    """Extend ``ir.fields.converter`` with an HTML image URL extractor."""

    _inherit = "ir.fields.converter"

    @api.model
    def imgs_from_html(self, html_content, limit=None, fail=False):
        """Extract all images in order from an HTML field in a generator.

        Image URLs are taken from the ``src`` attribute of ``<img>`` elements
        and from ``url(...)`` values found in ``background`` and
        ``background-image`` rules of inline ``style`` attributes, following
        the order in which the elements are returned by the XPath query.

        This method is a generator, so nothing (including parsing, logging
        and raising) happens until the result is iterated.

        :param str html_content:
            HTML contents from where to extract the images.

        :param int limit:
            Only get up to this number of image URLs. A falsy value
            (``None`` or ``0``) means no limit.

        :param bool fail:
            If ``True``, parsing exceptions will be raised. Otherwise they
            are logged and the generator finishes without yielding anything.

        :raise TypeError, lxml.etree.XMLSyntaxError, lxml.etree.ParserError:
            Only when ``fail`` is ``True`` and the HTML cannot be parsed.
        """
        # Parse HTML
        try:
            doc = html.fromstring(html_content)
        except (TypeError, etree.XMLSyntaxError, etree.ParserError):
            if fail:
                raise
            _logger.exception("Failure parsing this HTML:\n%s",
                              html_content)
            return

        # Required tools
        query = """
            //img[@src] |
            //*[contains(translate(@style, "BACKGROUND", "background"),
                         'background')]
               [contains(translate(@style, "URL", "url"), 'url(')]
        """
        rgx = r"""
            url\(\s*            # Start function
            (?P<url>[^)]*)      # URL string
            \s*\)               # End function
        """
        rgx = re.compile(rgx, re.IGNORECASE | re.VERBOSE)
        background_rules = ("background", "background-image")

        # Loop through possible image URLs
        found = 0
        for element in doc.xpath(query):
            if element.tag == "img" and "src" in element.attrib:
                urls = [element.attrib["src"]]
            else:
                # Any element with an inline background, including an
                # ``<img>`` that has no ``src`` but matched through its style
                urls = []
                for rule in element.attrib.get("style", "").split(";"):
                    name, _sep, value = rule.partition(":")
                    if name.strip().lower() not in background_rules:
                        continue
                    match = rgx.search(value)
                    # Malformed CSS or no match for URL
                    if match is None:
                        continue
                    # The greedy URL group keeps any whitespace that precedes
                    # the closing parenthesis, so drop it before the quotes
                    urls.append(match.group("url").strip().strip("\"'"))

            for url in urls:
                yield url
                # Count URLs actually produced, not elements visited, so the
                # limit holds even when one element provides several URLs
                found += 1
                if limit and found >= limit:
                    return
After Width: 128 | Height: 128 | Size: 9.2 KiB |
@ -0,0 +1,5 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis |
||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). |
||||
|
|
||||
|
from . import test_extractor |
@ -0,0 +1,69 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis |
||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). |
||||
|
|
||||
|
from lxml import etree |
||||
|
from openerp.tests.common import TransactionCase |
||||
|
|
||||
|
|
||||
|
class ExtractorCase(TransactionCase):
    """Tests for ``imgs_from_html`` of the ``ir.fields.converter`` model."""

    def setUp(self):
        super(ExtractorCase, self).setUp()

        # Shortcut
        self.imgs_from_html = self.env["ir.fields.converter"].imgs_from_html

    def test_mixed_images_found(self):
        """Images correctly found in <img> elements and backgrounds."""
        # Double quotes nested inside the double-quoted ``style`` attribute
        # must be written as ``&quot;`` or the attribute value ends early
        content = u"""
            <div>
                <!-- src-less img -->
                <img/>
                <p/>
                <img src="/path/0"/>
                <img src="/path/1"/>
                <img src="/path/2"/>
                <img src="/path/3"/>
                <section style="background : URL('/path/4');;background;ö;">
                    <div style='BACKGROUND-IMAGE:url(/path/5)'>
                        <p style="background:uRl(&quot;/path/6&quot;)">
                            <img src="/path/7"/>
                        </p>
                    </div>
                </section>
            </div>
            """
        expected = ["/path/%d" % n for n in range(8)]

        # Read all images
        self.assertEqual(list(self.imgs_from_html(content)), expected)

        # Read only first image
        self.assertEqual(list(self.imgs_from_html(content, 1)), expected[:1])

    def test_empty_html(self):
        """Empty HTML handled correctly."""
        # Without ``fail`` nothing must be yielded
        self.assertEqual(list(self.imgs_from_html("")), [])

        with self.assertRaises(etree.XMLSyntaxError):
            list(self.imgs_from_html("", fail=True))

    def test_false_html(self):
        """``False`` HTML handled correctly."""
        # Without ``fail`` nothing must be yielded
        self.assertEqual(list(self.imgs_from_html(False)), [])

        with self.assertRaises(TypeError):
            list(self.imgs_from_html(False, fail=True))

    def test_bad_html(self):
        """Bad HTML handled correctly."""
        # Without ``fail`` nothing must be yielded
        self.assertEqual(list(self.imgs_from_html("<<bad>")), [])

        with self.assertRaises(etree.ParserError):
            list(self.imgs_from_html("<<bad>", fail=True))
Write
Preview
Loading…
Cancel
Save
Reference in new issue