Browse Source
html_image_url_extractor module (#624)
html_image_url_extractor module (#624)
* Image extractor from HTML fields. (#354) * [8.0][html_image_url_extractor] Image extractor from HTML fields. This technical utility allows the developer to get a list of image URLs from any piece of HTML. You can use it, for example, to get the cover image from a blog post (upcoming module), or to create a slider with all images from it. * [9.0] [MIG] html_image_url_extractor * Updated README.rst pull/1155/head
cubells
8 years ago
committed by
tarteo
8 changed files with 260 additions and 0 deletions
-
78html_image_url_extractor/README.rst
-
6html_image_url_extractor/__init__.py
-
24html_image_url_extractor/__openerp__.py
-
6html_image_url_extractor/models/__init__.py
-
72html_image_url_extractor/models/ir_fields_converter.py
-
BINhtml_image_url_extractor/static/description/icon.png
-
5html_image_url_extractor/tests/__init__.py
-
69html_image_url_extractor/tests/test_extractor.py
@ -0,0 +1,78 @@ |
|||
.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg |
|||
:target: http://www.gnu.org/licenses/agpl-3.0-standalone.html |
|||
:alt: License: AGPL-3 |
|||
|
|||
========================== |
|||
Image URLs from HTML field |
|||
========================== |
|||
|
|||
This module includes a method that extracts image URLs from any chunk of HTML, |
|||
in order of appearance. |
|||
|
|||
Usage |
|||
===== |
|||
|
|||
This module just adds a technical utility, but nothing for the end user. |
|||
|
|||
If you are a developer and need this utility for your module, see these |
|||
examples and read the docs inside the code. |
|||
|
|||
Python example:: |
|||
|
|||
@api.multi |
|||
def some_method(self): |
|||
# Get images from an HTML field |
|||
imgs = self.env["ir.fields.converter"].imgs_from_html(self.html_field) |
|||
for url in imgs: |
|||
# Do stuff with those URLs |
|||
pass |
|||
|
|||
QWeb example:: |
|||
|
|||
<!-- Extract first image from a blog post --> |
|||
<t t-foreach="env['ir.fields.converter'] |
|||
.imgs_from_html(blog_post.content, 1)" |
|||
t-as="url"> |
|||
<img t-att-src="url"/> |
|||
</t> |
|||
|
|||
.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas |
|||
:alt: Try me on Runbot |
|||
:target: https://runbot.odoo-community.org/runbot/149/9.0 |
|||
|
|||
Known issues / Roadmap |
|||
====================== |
|||
|
|||
* The regexp to find the URL could be better. |
|||
|
|||
Bug Tracker |
|||
=========== |
|||
|
|||
Bugs are tracked on `GitHub Issues |
|||
<https://github.com/OCA/server-tools/issues>`_. In case of trouble, please |
|||
check there if your issue has already been reported. If you spotted it first, |
|||
help us smash it by providing detailed and welcome feedback. |
|||
|
|||
Credits |
|||
======= |
|||
|
|||
Contributors |
|||
------------ |
|||
|
|||
* Jairo Llopis <yajo.sk8@gmail.com> |
|||
* Vicent Cubells <vicent.cubells@tecnativa.com> |
|||
|
|||
Maintainer |
|||
---------- |
|||
|
|||
.. image:: https://odoo-community.org/logo.png |
|||
:alt: Odoo Community Association |
|||
:target: https://odoo-community.org |
|||
|
|||
This module is maintained by the OCA. |
|||
|
|||
OCA, or the Odoo Community Association, is a nonprofit organization whose |
|||
mission is to support the collaborative development of Odoo features and |
|||
promote its widespread use. |
|||
|
|||
To contribute to this module, please visit https://odoo-community.org. |
@ -0,0 +1,6 @@ |
|||
# -*- coding: utf-8 -*- |
|||
# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis |
|||
# Copyright 2016 Tecnativa - Vicent Cubells |
|||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). |
|||
|
|||
from . import models |
@ -0,0 +1,24 @@ |
|||
# -*- coding: utf-8 -*- |
|||
# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis |
|||
# Copyright 2016 Tecnativa - Vicent Cubells |
|||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). |
|||
# Module manifest: a single dict literal describing the addon.
{
    # Human-readable identification
    "name": "Image URLs from HTML field",
    "summary": "Extract images found in any HTML field",
    "version": "9.0.1.0.0",
    "category": "Tools",
    "website": "https://tecnativa.com",
    "author": "Tecnativa, Odoo Community Association (OCA)",
    "license": "AGPL-3",
    # Technical utility only: installable, but not a full application
    "application": False,
    "installable": True,
    # Python package required by the extractor model
    "external_dependencies": {"python": ["lxml.html"]},
    # Addons that must be installed first
    "depends": ["base"],
}
@ -0,0 +1,6 @@ |
|||
# -*- coding: utf-8 -*- |
|||
# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis |
|||
# Copyright 2016 Tecnativa - Vicent Cubells |
|||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). |
|||
|
|||
from . import ir_fields_converter |
@ -0,0 +1,72 @@ |
|||
# -*- coding: utf-8 -*- |
|||
# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis |
|||
# Copyright 2016 Tecnativa - Vicent Cubells |
|||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). |
|||
|
|||
import re |
|||
import logging |
|||
from lxml import etree, html |
|||
from openerp import api, models |
|||
|
|||
_logger = logging.getLogger(__name__) |
|||
|
|||
|
|||
class IrFieldsConverter(models.Model):
    """Extend ``ir.fields.converter`` with an image URL extractor for HTML."""

    _inherit = "ir.fields.converter"

    @api.model
    def imgs_from_html(self, html_content, limit=None, fail=False):
        """Extract all images in order from an HTML field in a generator.

        URLs are taken from the ``src`` attribute of ``<img>`` elements and
        from ``url(...)`` values found in inline ``background`` or
        ``background-image`` CSS rules of any element.

        :param str html_content:
            HTML contents from where to extract the images.

        :param int limit:
            Only get up to this number of image URLs. A falsy value
            (``None`` or ``0``) disables the limit.

        :param bool fail:
            If ``True``, parsing exceptions will be raised. Otherwise they
            are logged and the generator ends without yielding anything.

        :return:
            Generator that yields each image URL found, as a string.
        """
        # Parse HTML
        try:
            doc = html.fromstring(html_content)
        except (TypeError, etree.XMLSyntaxError, etree.ParserError):
            if fail:
                raise
            _logger.exception("Failure parsing this HTML:\n%s", html_content)
            return

        # Required tools
        query = """
            //img[@src] |
            //*[contains(translate(@style, "BACKGROUND", "background"),
                         'background')]
               [contains(translate(@style, "URL", "url"), 'url(')]
        """
        rgx = re.compile(
            r"""
            url\(\s*            # Start function
            (?P<url>[^)]*)      # URL string
            \s*\)               # End function
            """,
            re.IGNORECASE | re.VERBOSE)
        css_properties = {"background", "background-image"}

        # Loop through possible image URLs
        found = 0
        for element in doc.xpath(query):
            if element.tag == "img" and "src" in element.attrib:
                urls = [element.attrib["src"]]
            else:
                # Only elements matched through their ``style`` attribute get
                # here, including ``<img>`` elements that have no ``src``.
                urls = []
                for rule in element.attrib["style"].split(";"):
                    name, _sep, value = rule.partition(":")
                    if name.strip().lower() not in css_properties:
                        continue
                    match = rgx.search(value)
                    # Malformed CSS or no match for URL
                    if match is None:
                        continue
                    # The greedy URL group may swallow the blanks that
                    # precede the closing parenthesis, so drop them before
                    # removing the optional quotes.
                    urls.append(match.group("url").strip().strip("\"'"))

            # The limit counts URLs actually yielded, not elements visited
            for url in urls:
                if limit and found >= limit:
                    return
                found += 1
                yield url
After Width: 128 | Height: 128 | Size: 9.2 KiB |
@ -0,0 +1,5 @@ |
|||
# -*- coding: utf-8 -*- |
|||
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis |
|||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). |
|||
|
|||
from . import test_extractor |
@ -0,0 +1,69 @@ |
|||
# -*- coding: utf-8 -*- |
|||
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis |
|||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). |
|||
|
|||
from lxml import etree |
|||
from openerp.tests.common import TransactionCase |
|||
|
|||
|
|||
class ExtractorCase(TransactionCase):
    """Tests for the ``imgs_from_html`` method of ``ir.fields.converter``."""

    def setUp(self):
        super(ExtractorCase, self).setUp()

        # Shortcut
        self.imgs_from_html = self.env["ir.fields.converter"].imgs_from_html

        # NOTE(review): which of these lxml raises for a given broken input
        # presumably depends on the lxml/libxml2 version; the extractor
        # itself handles both the same way, so the tests accept either.
        self.parse_errors = (etree.XMLSyntaxError, etree.ParserError)

    def test_mixed_images_found(self):
        """Images correctly found in <img> elements and backgrounds."""
        # Quotes inside the double-quoted ``style`` attribute must be escaped
        # as ``&quot;``; raw quotes would end the attribute prematurely.
        content = u"""
            <div>
                <!-- src-less img -->
                <img/>
                <p/>
                <img src="/path/0"/>
                <img src="/path/1"/>
                <img src="/path/2"/>
                <img src="/path/3"/>
                <section style="background : URL('/path/4');;background;ö;">
                    <div style='BACKGROUND-IMAGE:url(/path/5)'>
                        <p style="background:uRl(&quot;/path/6&quot;)">
                            <img src="/path/7"/>
                        </p>
                    </div>
                </section>
            </div>
            """

        # Read all images
        self.assertEqual(
            list(self.imgs_from_html(content)),
            ["/path/%d" % n for n in range(8)])

        # Read only first image
        self.assertEqual(list(self.imgs_from_html(content, 1)), ["/path/0"])

    def test_empty_html(self):
        """Empty HTML handled correctly."""
        self.assertEqual(list(self.imgs_from_html("")), [])

        with self.assertRaises(self.parse_errors):
            list(self.imgs_from_html("", fail=True))

    def test_false_html(self):
        """``False`` HTML handled correctly."""
        self.assertEqual(list(self.imgs_from_html(False)), [])

        with self.assertRaises(TypeError):
            list(self.imgs_from_html(False, fail=True))

    def test_bad_html(self):
        """Bad HTML handled correctly."""
        self.assertEqual(list(self.imgs_from_html("<<bad>")), [])

        with self.assertRaises(self.parse_errors):
            list(self.imgs_from_html("<<bad>", fail=True))
Write
Preview
Loading…
Cancel
Save
Reference in new issue