1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """The ``lxml.html`` tool set for HTML handling.
32 """
33
34 from __future__ import absolute_import
35
# Public names exported by ``from lxml.html import *``.
# Each name is listed exactly once.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring',
    'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser',
    'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse',
]
41
42
43 import copy
44 import sys
45 import re
46 from functools import partial
47
48 try:
49
50 from collections.abc import MutableMapping, MutableSet
51 except ImportError:
52 from collections import MutableMapping, MutableSet
53
54 from .. import etree
55 from . import defs
56 from ._setmixin import SetMixin
57
58 try:
59 from urlparse import urljoin
60 except ImportError:
61
62 from urllib.parse import urljoin
63
64 try:
65 unicode
66 except NameError:
67
68 unicode = str
69 try:
70 basestring
71 except NameError:
72
73 basestring = (str, bytes)
77 if not s:
78 return s
79 if sys.version_info[0] >= 3:
80 sub = re.compile(r"^(\s*)u'", re.M).sub
81 else:
82 sub = re.compile(r"^(\s*)b'", re.M).sub
83 return sub(r"\1'", s)
84
85
86 XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
87
88 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
89 namespaces={'x':XHTML_NAMESPACE})
90 _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
91 namespaces={'x':XHTML_NAMESPACE})
92 _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
93 namespaces={'x':XHTML_NAMESPACE})
94
95 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
96 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
97 _collect_string_content = etree.XPath("string()")
98 _iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
99 _iter_css_imports = re.compile(r'@import "(.*?)"').finditer
100 _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
101 namespaces={'x':XHTML_NAMESPACE})
102 _archive_re = re.compile(r'[^ ]+')
103 _parse_meta_refresh_url = re.compile(
104 r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
108 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
109 return s[1:-1], pos+1
110 else:
111 return s,pos
112
123
130
133 """Provides access to an element's class attribute as a set-like collection.
134 Usage::
135
136 >>> el = fromstring('<p class="hidden large">Text</p>')
137 >>> classes = el.classes # or: classes = Classes(el.attrib)
138 >>> classes |= ['block', 'paragraph']
139 >>> el.get('class')
140 'hidden large block paragraph'
141 >>> classes.toggle('hidden')
142 False
143 >>> el.get('class')
144 'large block paragraph'
145 >>> classes -= ('some', 'classes', 'block')
146 >>> el.get('class')
147 'large paragraph'
148 """
150 self._attributes = attributes
151 self._get_class_value = partial(attributes.get, 'class', '')
152
def add(self, value):
    """
    Add a class name to the 'class' attribute.

    Nothing is written if the name is already listed.  Raises
    ValueError for an empty name or one that contains whitespace.
    """
    name_is_valid = bool(value) and re.search(r'\s', value) is None
    if not name_is_valid:
        raise ValueError("Invalid class name: %r" % value)
    current = self._get_class_value().split()
    if value not in current:
        # Only touch the attribute when the name is actually new.
        self._attributes['class'] = ' '.join(current + [value])
166
168 """
169 Remove a class if it is currently present.
170
171 If the class is not present, do nothing.
172 """
173 if not value or re.search(r'\s', value):
174 raise ValueError("Invalid class name: %r" % value)
175 classes = [name for name in self._get_class_value().split()
176 if name != value]
177 if classes:
178 self._attributes['class'] = ' '.join(classes)
179 elif 'class' in self._attributes:
180 del self._attributes['class']
181
183 """
184 Remove a class; it must currently be present.
185
186 If the class is not present, raise a KeyError.
187 """
188 if not value or re.search(r'\s', value):
189 raise ValueError("Invalid class name: %r" % value)
190 super(Classes, self).remove(value)
191
195
197 return iter(self._get_class_value().split())
198
200 return len(self._get_class_value().split())
201
202
203
205 """
206 Add all names from 'values'.
207 """
208 classes = self._get_class_value().split()
209 extended = False
210 for value in values:
211 if value not in classes:
212 classes.append(value)
213 extended = True
214 if extended:
215 self._attributes['class'] = ' '.join(classes)
216
218 """
219 Add a class name if it isn't there yet, or remove it if it exists.
220
221 Returns true if the class was added (and is now enabled) and
222 false if it was removed (and is now disabled).
223 """
224 if not value or re.search(r'\s', value):
225 raise ValueError("Invalid class name: %r" % value)
226 classes = self._get_class_value().split()
227 try:
228 classes.remove(value)
229 enabled = False
230 except ValueError:
231 classes.append(value)
232 enabled = True
233 if classes:
234 self._attributes['class'] = ' '.join(classes)
235 else:
236 del self._attributes['class']
237 return enabled
238
241
def set(self, key, value=None):
    """set(self, key, value=None)

    Set the attribute ``key`` on this element by delegating to the
    parent class implementation.

    When ``value`` is omitted or None, a 'boolean' attribute without a
    value is created, e.g. ``form.set('novalidate')`` produces
    "<form novalidate></form>".
    """
    base_set = super(HtmlElement, self).set
    base_set(key, value)
250
251 @property
253 """
254 A set-like wrapper around the 'class' attribute.
255 """
256 return Classes(self.attrib)
257
258 @classes.setter
266
267 @property
269 """
270 Returns the base URL, given when the page was parsed.
271
272 Use with ``urlparse.urljoin(el.base_url, href)`` to get
273 absolute URLs.
274 """
275 return self.getroottree().docinfo.URL
276
277 @property
283
284 @property
286 """
287 Return the <body> element. Can be called from a child element
288 to get the document's head.
289 """
290 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
291
292 @property
294 """
295 Returns the <head> element. Can be called from a child
296 element to get the document's head.
297 """
298 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
299
300 @property
302 """
303 Get or set any <label> element associated with this element.
304 """
305 id = self.get('id')
306 if not id:
307 return None
308 result = _label_xpath(self, id=id)
309 if not result:
310 return None
311 else:
312 return result[0]
313
314 @label.setter
316 id = self.get('id')
317 if not id:
318 raise TypeError(
319 "You cannot set a label for an element (%r) that has no id"
320 % self)
321 if _nons(label.tag) != 'label':
322 raise TypeError(
323 "You can only assign label to a label element (not %r)"
324 % label)
325 label.set('for', id)
326
327 @label.deleter
332
334 """
335 Removes this element from the tree, including its children and
336 text. The tail text is joined to the previous element or
337 parent.
338 """
339 parent = self.getparent()
340 assert parent is not None
341 if self.tail:
342 previous = self.getprevious()
343 if previous is None:
344 parent.text = (parent.text or '') + self.tail
345 else:
346 previous.tail = (previous.tail or '') + self.tail
347 parent.remove(self)
348
350 """
351 Remove the tag, but not its children or text. The children and text
352 are merged into the parent.
353
354 Example::
355
356 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
357 >>> h.find('.//b').drop_tag()
358 >>> print(tostring(h, encoding='unicode'))
359 <div>Hello World!</div>
360 """
361 parent = self.getparent()
362 assert parent is not None
363 previous = self.getprevious()
364 if self.text and isinstance(self.tag, basestring):
365
366 if previous is None:
367 parent.text = (parent.text or '') + self.text
368 else:
369 previous.tail = (previous.tail or '') + self.text
370 if self.tail:
371 if len(self):
372 last = self[-1]
373 last.tail = (last.tail or '') + self.tail
374 elif previous is None:
375 parent.text = (parent.text or '') + self.tail
376 else:
377 previous.tail = (previous.tail or '') + self.tail
378 index = parent.index(self)
379 parent[index:index+1] = self[:]
380
382 """
383 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
384 """
385 rel = rel.lower()
386 return [el for el in _rel_links_xpath(self)
387 if el.get('rel').lower() == rel]
388
390 """
391 Find any elements with the given class name.
392 """
393 return _class_xpath(self, class_name=class_name)
394
396 """
397 Get the first element in a document with the given id. If none is
398 found, return the default argument if provided or raise KeyError
399 otherwise.
400
401 Note that there can be more than one element with the same id,
402 and this isn't uncommon in HTML documents found in the wild.
403 Browsers return only the first match, and this function does
404 the same.
405 """
406 try:
407
408
409 return _id_xpath(self, id=id)[0]
410 except IndexError:
411 if default:
412 return default[0]
413 else:
414 raise KeyError(id)
415
def text_content(self):
    """
    Return the text of this tag together with the text of all of its
    children, as computed by the XPath ``string()`` function.
    """
    content = _collect_string_content(self)
    return content
421
def cssselect(self, expr, translator='html'):
    """
    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
    -- note that pre-compiling the expression can provide a substantial
    speedup.

    ``expr`` is the CSS selector string; ``translator`` is handed to
    ``CSSSelector`` unchanged and defaults to ``'html'``.
    """
    # Imported here rather than at module level, so ``lxml.cssselect``
    # is only loaded when this method is actually called.
    from lxml.cssselect import CSSSelector
    return CSSSelector(expr, translator=translator)(self)
434
435
436
437
438
def make_links_absolute(self, base_url=None, resolve_base_href=True,
                        handle_failures=None):
    """
    Make all links in the document absolute, given the
    ``base_url`` for the document (the full URL where the document
    came from), or if no ``base_url`` is given, then the ``.base_url``
    of the document.

    If ``resolve_base_href`` is true, then any ``<base href>``
    tags in the document are used *and* removed from the document.
    If it is false then any such tag is ignored.

    If ``handle_failures`` is None (default), a failure to process
    a URL will abort the processing.  If set to 'ignore', errors
    are ignored.  If set to 'discard', failing URLs will be removed.
    The same policy is applied while resolving ``<base href>``.

    Raises TypeError if no base URL is available, and ValueError for
    an unknown ``handle_failures`` value (before the document is
    modified).
    """
    if base_url is None:
        base_url = self.base_url
        if base_url is None:
            raise TypeError(
                "No base_url given, and the document has no base_url")

    # Reject a bad policy up front so that an invalid call cannot
    # leave the document half-processed (base tags already removed).
    if handle_failures not in (None, 'ignore', 'discard'):
        raise ValueError(
            "unexpected value for handle_failures: %r" % handle_failures)

    if resolve_base_href:
        self.resolve_base_href(handle_failures=handle_failures)

    def link_repl(href):
        try:
            return urljoin(base_url, href)
        except ValueError:
            if handle_failures is None:
                raise
            # 'ignore' keeps the original link, 'discard' drops it.
            return href if handle_failures == 'ignore' else None

    # <base href> has been dealt with above (or must be ignored), so
    # rewrite_links() must not resolve it again on its own.
    self.rewrite_links(link_repl, resolve_base_href=False)
483
485 """
486 Find any ``<base href>`` tag in the document, and apply its
487 values to all links found in the document. Also remove the
488 tag once it has been applied.
489
490 If ``handle_failures`` is None (default), a failure to process
491 a URL will abort the processing. If set to 'ignore', errors
492 are ignored. If set to 'discard', failing URLs will be removed.
493 """
494 base_href = None
495 basetags = self.xpath('//base[@href]|//x:base[@href]',
496 namespaces={'x': XHTML_NAMESPACE})
497 for b in basetags:
498 base_href = b.get('href')
499 b.drop_tree()
500 if not base_href:
501 return
502 self.make_links_absolute(base_href, resolve_base_href=False,
503 handle_failures=handle_failures)
504
506 """
507 Yield (element, attribute, link, pos), where attribute may be None
508 (indicating the link is in the text). ``pos`` is the position
509 where the link occurs; often 0, but sometimes something else in
510 the case of links in stylesheets or style tags.
511
512 Note: <base href> is *not* taken into account in any way. The
513 link you get is exactly the link in the document.
514
515 Note: multiple links inside of a single text string or
516 attribute value are returned in reversed order. This makes it
517 possible to replace or delete them from the text string value
518 based on their reported text positions. Otherwise, a
519 modification at one text position can change the positions of
520 links reported later on.
521 """
522 link_attrs = defs.link_attrs
523 for el in self.iter(etree.Element):
524 attribs = el.attrib
525 tag = _nons(el.tag)
526 if tag == 'object':
527 codebase = None
528
529
530 if 'codebase' in attribs:
531 codebase = el.get('codebase')
532 yield (el, 'codebase', codebase, 0)
533 for attrib in ('classid', 'data'):
534 if attrib in attribs:
535 value = el.get(attrib)
536 if codebase is not None:
537 value = urljoin(codebase, value)
538 yield (el, attrib, value, 0)
539 if 'archive' in attribs:
540 for match in _archive_re.finditer(el.get('archive')):
541 value = match.group(0)
542 if codebase is not None:
543 value = urljoin(codebase, value)
544 yield (el, 'archive', value, match.start())
545 else:
546 for attrib in link_attrs:
547 if attrib in attribs:
548 yield (el, attrib, attribs[attrib], 0)
549 if tag == 'meta':
550 http_equiv = attribs.get('http-equiv', '').lower()
551 if http_equiv == 'refresh':
552 content = attribs.get('content', '')
553 match = _parse_meta_refresh_url(content)
554 url = (match.group('url') if match else content).strip()
555
556
557 if url:
558 url, pos = _unquote_match(
559 url, match.start('url') if match else content.find(url))
560 yield (el, 'content', url, pos)
561 elif tag == 'param':
562 valuetype = el.get('valuetype') or ''
563 if valuetype.lower() == 'ref':
564
565
566
567
568
569
570 yield (el, 'value', el.get('value'), 0)
571 elif tag == 'style' and el.text:
572 urls = [
573
574 _unquote_match(match.group(1), match.start(1))[::-1]
575 for match in _iter_css_urls(el.text)
576 ] + [
577 (match.start(1), match.group(1))
578 for match in _iter_css_imports(el.text)
579 ]
580 if urls:
581
582
583
584 urls.sort(reverse=True)
585 for start, url in urls:
586 yield (el, None, url, start)
587 if 'style' in attribs:
588 urls = list(_iter_css_urls(attribs['style']))
589 if urls:
590
591 for match in urls[::-1]:
592 url, start = _unquote_match(match.group(1), match.start(1))
593 yield (el, 'style', url, start)
594
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.  Further links reported for
    an attribute or text that has already been removed are skipped.
    """
    if base_href is not None:
        # Done as a separate pass over the document before the
        # caller's replacement function is applied.
        self.make_links_absolute(
            base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    for el, attrib, link, pos in self.iterlinks():
        # The callback sees the stripped link, while positions and
        # lengths below refer to the link as found in the document.
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        if new_link is None:
            # Remove the attribute or the element text.
            if attrib is None:
                el.text = ''
            elif attrib in el.attrib:
                # One attribute can carry several links ('archive',
                # 'style'); it may already have been deleted.
                del el.attrib[attrib]
            continue

        if attrib is None:
            text = el.text
            if not text:
                # Text was cleared for an earlier link of this element.
                continue
            el.text = text[:pos] + new_link + text[pos+len(link):]
        else:
            cur = el.get(attrib)
            if cur is None:
                # Attribute was removed for an earlier link in its value.
                continue
            if not pos and len(cur) == len(link):
                new = new_link  # most common case: link is the whole value
            else:
                new = cur[:pos] + new_link + cur[pos+len(link):]
            el.set(attrib, new)
643
646 """
647 An object that represents a method on an element as a function;
648 the function takes either an element or an HTML string. It
649 returns whatever the function normally returns, or if the function
650 works in-place (and so returns None) it returns a serialized form
651 of the resulting document.
652 """
658 result_type = type(doc)
659 if isinstance(doc, basestring):
660 if 'copy' in kw:
661 raise TypeError(
662 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
663 doc = fromstring(doc, **kw)
664 else:
665 if 'copy' in kw:
666 make_a_copy = kw.pop('copy')
667 else:
668 make_a_copy = self.copy
669 if make_a_copy:
670 doc = copy.deepcopy(doc)
671 meth = getattr(doc, self.name)
672 result = meth(*args, **kw)
673
674 if result is None:
675
676 return _transform_result(result_type, doc)
677 else:
678 return result
679
680
681 find_rel_links = _MethodFunc('find_rel_links', copy=False)
682 find_class = _MethodFunc('find_class', copy=False)
683 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
684 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
685 iterlinks = _MethodFunc('iterlinks', copy=False)
686 rewrite_links = _MethodFunc('rewrite_links', copy=True)
691
697
701
702
703 -class HtmlEntity(etree.EntityBase, HtmlMixin):
705
708 """A lookup scheme for HTML Element classes.
709
710 To create a lookup instance with different Element classes, pass a tag
711 name mapping of Element classes in the ``classes`` keyword argument and/or
712 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
713 The special key '*' denotes a Mixin class that should be mixed into all
714 Element classes.
715 """
716 _default_element_classes = {}
717
718 - def __init__(self, classes=None, mixins=None):
735
736 - def lookup(self, node_type, document, namespace, name):
747
748
749
750
751
752
753 _looks_like_full_html_unicode = re.compile(
754 unicode(r'^\s*<(?:html|!doctype)'), re.I).match
755 _looks_like_full_html_bytes = re.compile(
756 r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
771
775 """Parses several HTML elements, returning a list of elements.
776
777 The first item in the list may be a string.
778 If no_leading_text is true, then it will be an error if there is
779 leading text, and it will always be a list of only elements.
780
781 base_url will set the document's base_url attribute
782 (and the tree's docinfo.URL).
783 """
784 if parser is None:
785 parser = html_parser
786
787 if isinstance(html, bytes):
788 if not _looks_like_full_html_bytes(html):
789
790 html = ('<html><body>'.encode('ascii') + html +
791 '</body></html>'.encode('ascii'))
792 else:
793 if not _looks_like_full_html_unicode(html):
794 html = '<html><body>%s</body></html>' % html
795 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
796 assert _nons(doc.tag) == 'html'
797 bodies = [e for e in doc if _nons(e.tag) == 'body']
798 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
799 body = bodies[0]
800 elements = []
801 if no_leading_text and body.text and body.text.strip():
802 raise etree.ParserError(
803 "There is leading text: %r" % body.text)
804 if body.text and body.text.strip():
805 elements.append(body.text)
806 elements.extend(body)
807
808
809 return elements
810
814 """
815 Parses a single HTML element; it is an error if there is more than
816 one element, or if anything but whitespace precedes or follows the
817 element.
818
819 If ``create_parent`` is true (or is a tag name) then a parent node
820 will be created to encapsulate the HTML in a single element. In this
821 case, leading or trailing text is also allowed, as are multiple elements
822 as result of the parsing.
823
824 Passing a ``base_url`` will set the document's ``base_url`` attribute
825 (and the tree's docinfo.URL).
826 """
827 if parser is None:
828 parser = html_parser
829
830 accept_leading_text = bool(create_parent)
831
832 elements = fragments_fromstring(
833 html, parser=parser, no_leading_text=not accept_leading_text,
834 base_url=base_url, **kw)
835
836 if create_parent:
837 if not isinstance(create_parent, basestring):
838 create_parent = 'div'
839 new_root = Element(create_parent)
840 if elements:
841 if isinstance(elements[0], basestring):
842 new_root.text = elements[0]
843 del elements[0]
844 new_root.extend(elements)
845 return new_root
846
847 if not elements:
848 raise etree.ParserError('No elements found')
849 if len(elements) > 1:
850 raise etree.ParserError(
851 "Multiple elements found (%s)"
852 % ', '.join([_element_name(e) for e in elements]))
853 el = elements[0]
854 if el.tail and el.tail.strip():
855 raise etree.ParserError(
856 "Element followed by text: %r" % el.tail)
857 el.tail = None
858 return el
859
860
861 -def fromstring(html, base_url=None, parser=None, **kw):
927
928
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.

    The return value is a tree, not an element; call
    ``parse(...).getroot()`` to obtain the document root.

    ``parser`` defaults to the module's ``html_parser``.  ``base_url``
    overrides the base URL of the document, which is most useful when
    parsing from a file-like object.  Extra keyword arguments are
    passed on to ``etree.parse()``.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
941
950
953 if isinstance(el, etree.CommentBase):
954 return 'comment'
955 elif isinstance(el, basestring):
956 return 'string'
957 else:
958 return _nons(el.tag)
959
1079
1080
1081 HtmlElementClassLookup._default_element_classes['form'] = FormElement
1120
1123 if not url:
1124 raise ValueError("cannot submit, no URL provided")
1125
1126 try:
1127 from urllib import urlencode, urlopen
1128 except ImportError:
1129 from urllib.request import urlopen
1130 from urllib.parse import urlencode
1131 if method == 'GET':
1132 if '?' in url:
1133 url += '&'
1134 else:
1135 url += '?'
1136 url += urlencode(values)
1137 data = None
1138 else:
1139 data = urlencode(values)
1140 if not isinstance(data, bytes):
1141 data = data.encode('ASCII')
1142 return urlopen(url, data)
1143
1146
1154 raise KeyError(
1155 "You cannot remove keys from ElementDict")
1159 return item in self.inputs
1164
1166 return '<%s for form %s>' % (
1167 self.__class__.__name__,
1168 self.inputs.form._name())
1169
1236
1267
1268
1269 -class TextareaElement(InputMixin, HtmlElement):
1270 """
1271 ``<textarea>`` element. You can get the name with ``.name`` and
1272 get/set the value with ``.value``
1273 """
1274 @property
1276 """
1277 Get/set the value (which is the contents of this element)
1278 """
1279 content = self.text or ''
1280 if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
1281 serialisation_method = 'xml'
1282 else:
1283 serialisation_method = 'html'
1284 for el in self:
1285
1286 content += etree.tostring(
1287 el, method=serialisation_method, encoding='unicode')
1288 return content
1289
1290 @value.setter
def value(self, value):
    """
    Set the textarea content: ``value`` becomes the element text and
    all child elements are removed.
    """
    self.text = value
    del self[:]
1294
1295 @value.deleter
1297 self.text = ''
1298 del self[:]
1299
1300
1301 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1305 """
1306 ``<select>`` element. You can get the name with ``.name``.
1307
1308 ``.value`` will be the value of the selected option, unless this
1309 is a multi-select element (``<select multiple>``), in which case
1310 it will be a set-like object. In either case ``.value_options``
1311 gives the possible values.
1312
1313 The boolean attribute ``.multiple`` shows if this is a
1314 multi-select.
1315 """
1316 @property
1318 """
1319 Get/set the value of this select (the selected option).
1320
1321 If this is a multi-select, this is a set-like object that
1322 represents all the selected options.
1323 """
1324 if self.multiple:
1325 return MultipleSelectOptions(self)
1326 options = _options_xpath(self)
1327
1328 try:
1329 selected_option = next(el for el in reversed(options) if el.get('selected') is not None)
1330 except StopIteration:
1331 try:
1332 selected_option = next(el for el in options if el.get('disabled') is None)
1333 except StopIteration:
1334 return None
1335 value = selected_option.get('value')
1336 if value is None:
1337 value = (selected_option.text or '').strip()
1338 return value
1339
1340 @value.setter
def value(self, value):
    """
    Select the option(s) matching ``value``.

    For a multi-select, ``value`` must be a non-string iterable; the
    current selection is cleared and replaced by it (a string raises
    TypeError).

    Otherwise the first option whose ``value`` attribute (or, lacking
    one, whose stripped text) equals ``value`` becomes the only
    selected option.  ``None`` deselects everything.  Raises
    ValueError if no option matches.
    """
    if self.multiple:
        if isinstance(value, basestring):
            raise TypeError("You must pass in a sequence")
        selection = self.value
        selection.clear()
        selection.update(value)
        return

    target = None
    if value is not None:
        for option in _options_xpath(self):
            candidate = option.get('value')
            if candidate is None:
                # Options without a value attribute are matched by text.
                candidate = (option.text or '').strip()
            if candidate == value:
                target = option
                break
        if target is None:
            raise ValueError(
                "There is no option with the value of %r" % value)

    # Deselect everything first, then mark the single match (if any).
    for option in _options_xpath(self):
        if 'selected' in option.attrib:
            del option.attrib['selected']
    if target is not None:
        target.set('selected', '')
1366
1367 @value.deleter
1374
1375 @property
1388
1389 @property
1391 """
1392 Boolean attribute: is there a ``multiple`` attribute on this element.
1393 """
1394 return 'multiple' in self.attrib
1395
1396 @multiple.setter
1398 if value:
1399 self.set('multiple', '')
1400 elif 'multiple' in self.attrib:
1401 del self.attrib['multiple']
1402
1403
1404 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1408 """
1409 Represents all the selected options in a ``<select multiple>`` element.
1410
1411 You can add to this set-like option to select an option, or remove
1412 to unselect the option.
1413 """
1414
1416 self.select = select
1417
1418 @property
1420 """
1421 Iterator of all the ``<option>`` elements.
1422 """
1423 return iter(_options_xpath(self.select))
1424
1426 for option in self.options:
1427 if 'selected' in option.attrib:
1428 opt_value = option.get('value')
1429 if opt_value is None:
1430 opt_value = (option.text or '').strip()
1431 yield opt_value
1432
def add(self, item):
    """
    Mark the first option whose value equals ``item`` as selected.

    An option's value is its ``value`` attribute or, if it has none,
    its stripped text.  Raises ValueError if no option matches.
    """
    for candidate in self.options:
        candidate_value = candidate.get('value')
        if candidate_value is None:
            candidate_value = (candidate.text or '').strip()
        if candidate_value == item:
            candidate.set('selected', '')
            return
    raise ValueError(
        "There is no option with the value %r" % item)
1444
1446 for option in self.options:
1447 opt_value = option.get('value')
1448 if opt_value is None:
1449 opt_value = (option.text or '').strip()
1450 if opt_value == item:
1451 if 'selected' in option.attrib:
1452 del option.attrib['selected']
1453 else:
1454 raise ValueError(
1455 "The option %r is not currently selected" % item)
1456 break
1457 else:
1458 raise ValueError(
1459 "There is not option with the value %r" % item)
1460
1462 return '<%s {%s} for select name=%r>' % (
1463 self.__class__.__name__,
1464 ', '.join([repr(v) for v in self]),
1465 self.select.name)
1466
1469 """
1470 This object represents several ``<input type=radio>`` elements
1471 that have the same name.
1472
1473 You can use this like a list, but also use the property
1474 ``.value`` to check/uncheck inputs. Also you can use
1475 ``.value_options`` to get the possible values.
1476 """
1477 @property
1479 """
1480 Get/set the value, which checks the radio with that value (and
1481 unchecks any other value).
1482 """
1483 for el in self:
1484 if 'checked' in el.attrib:
1485 return el.get('value')
1486 return None
1487
1488 @value.setter
def value(self, value):
    """
    Check the radio input whose ``value`` attribute equals ``value``
    and uncheck all others.  ``None`` unchecks every input.

    Raises ValueError, leaving the group untouched, if no input has
    the requested value.
    """
    target = None
    if value is not None:
        matching = (radio for radio in self if radio.get('value') == value)
        target = next(matching, None)
        if target is None:
            raise ValueError("There is no radio input with the value %r" % value)
    for radio in self:
        if 'checked' in radio.attrib:
            del radio.attrib['checked']
    if target is not None:
        target.set('checked', '')
1503
1504 @value.deleter
1507
1508 @property
1510 """
1511 Returns a list of all the possible values.
1512 """
1513 return [el.get('value') for el in self]
1514
1516 return '%s(%s)' % (
1517 self.__class__.__name__,
1518 list.__repr__(self))
1519
1522 """
1523 Represents a group of checkboxes (``<input type=checkbox>``) that
1524 have the same name.
1525
1526 In addition to using this like a list, the ``.value`` attribute
1527 returns a set-like object that you can add to or remove from to
1528 check and uncheck checkboxes. You can also use ``.value_options``
1529 to get the possible values.
1530 """
1531 @property
1533 """
1534 Return a set-like object that can be modified to check or
1535 uncheck individual checkboxes according to their value.
1536 """
1537 return CheckboxValues(self)
1538
1539 @value.setter
1540 - def value(self, value):
1548
1549 @value.deleter
1552
1553 @property
1555 """
1556 Returns a list of all the possible values.
1557 """
1558 return [el.get('value') for el in self]
1559
1561 return '%s(%s)' % (
1562 self.__class__.__name__, list.__repr__(self))
1563
1566 """
1567 Represents the values of the checked checkboxes in a group of
1568 checkboxes with the same name.
1569 """
1570
1573
1575 return iter([
1576 el.get('value')
1577 for el in self.group
1578 if 'checked' in el.attrib])
1579
def add(self, value):
    """
    Check the first checkbox in the group whose ``value`` attribute
    equals ``value``.  Raises KeyError if there is no such checkbox.
    """
    matching = (box for box in self.group if box.get('value') == value)
    box = next(matching, None)
    if box is None:
        raise KeyError("No checkbox with value %r" % value)
    box.set('checked', '')
1587
1589 for el in self.group:
1590 if el.get('value') == value:
1591 if 'checked' in el.attrib:
1592 del el.attrib['checked']
1593 else:
1594 raise KeyError(
1595 "The checkbox with value %r was already unchecked" % value)
1596 break
1597 else:
1598 raise KeyError(
1599 "No checkbox with value %r" % value)
1600
1602 return '<%s {%s} for checkboxes name=%r>' % (
1603 self.__class__.__name__,
1604 ', '.join([repr(v) for v in self]),
1605 self.group.name)
1606
1700
1701
1702 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1706 """
1707 Represents a ``<label>`` element.
1708
1709 Label elements are linked to other elements with their ``for``
1710 attribute. You can access this element with ``label.for_element``.
1711 """
1712 @property
1714 """
1715 Get/set the element this label points to. Return None if it
1716 can't be found.
1717 """
1718 id = self.get('for')
1719 if not id:
1720 return None
1721 return self.body.get_element_by_id(id)
1722
1723 @for_element.setter
1725 id = other.get('id')
1726 if not id:
1727 raise TypeError(
1728 "Element %r has no id attribute" % other)
1729 self.set('for', id)
1730
1731 @for_element.deleter
1736
1737
1738 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1758
1761 """Convert all tags in an XHTML tree to HTML by removing their
1762 XHTML namespace.
1763 """
1764 try:
1765 xhtml = xhtml.getroot()
1766 except AttributeError:
1767 pass
1768 prefix = "{%s}" % XHTML_NAMESPACE
1769 prefix_len = len(prefix)
1770 for el in xhtml.iter(prefix + "*"):
1771 el.tag = el.tag[prefix_len:]
1772
1773
1774
1775
1776 __str_replace_meta_content_type = re.compile(
1777 r'<meta http-equiv="Content-Type"[^>]*>').sub
1778 __bytes_replace_meta_content_type = re.compile(
1779 r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1780
1781
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Serialise a document or element to an HTML string.

    With ``method='html'`` and a false ``include_meta_content_type``
    (the default), every ``<meta http-equiv="Content-Type" ...>`` tag
    is stripped from the serialised result.  Pass
    ``include_meta_content_type=True`` to keep whatever meta tag the
    serialiser emits.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument selects the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option accepts a plain string that is serialised
    before the tree.  Passing non well-formed content here makes the
    XML output non well-formed as well.  An existing doctype in the
    document tree is not removed when serialising an ElementTree
    instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print, encoding=encoding,
        with_tail=with_tail, doctype=doctype)

    # Only HTML output is post-processed, and only when the caller did
    # not ask to keep the Content-Type meta tag.
    if method != 'html' or include_meta_content_type:
        return serialised

    # Use the substitution helper that matches the result type.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
1854
1855
# Adapt the example output in the docstring to the running interpreter:
# __fix_docstring strips the leading u'' prefix on Python 3 and the
# leading b'' prefix on Python 2.
tostring.__doc__ = __fix_docstring(tostring.__doc__)
1860 """
1861 Open the HTML document in a web browser, saving it to a temporary
1862 file to open it. Note that this does not delete the file after
1863 use. This is mainly meant for debugging.
1864 """
1865 import os
1866 import webbrowser
1867 import tempfile
1868 if not isinstance(doc, etree._ElementTree):
1869 doc = etree.ElementTree(doc)
1870 handle, fn = tempfile.mkstemp(suffix='.html')
1871 f = os.fdopen(handle, 'wb')
1872 try:
1873 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1874 finally:
1875
1876 f.close()
1877 url = 'file://' + fn.replace(os.path.sep, '/')
1878 print(url)
1879 webbrowser.open(url)
1880
1881
1882
1883
1884
1885
1886 -class HTMLParser(etree.HTMLParser):
1887 """An HTML parser that is configured to return lxml.html Element
1888 objects.
1889 """
1893
1896 """An XML parser that is configured to return lxml.html Element
1897 objects.
1898
1899 Note that this parser is not really XHTML aware unless you let it
1900 load a DTD that declares the HTML entities. To do this, make sure
1901 you have the XHTML DTDs installed in your catalogs, and create the
1902 parser like this::
1903
1904 >>> parser = XHTMLParser(load_dtd=True)
1905
1906 If you additionally want to validate the document, use this::
1907
1908 >>> parser = XHTMLParser(dtd_validation=True)
1909
1910 For catalog support, see http://www.xmlsoft.org/catalog.html.
1911 """
1915
1918 """Create a new HTML Element.
1919
1920 This can also be used for XHTML documents.
1921 """
1922 v = html_parser.makeelement(*args, **kw)
1923 return v
1924
1925
# Module-level default parser instances.  Element() creates its
# elements through html_parser.makeelement().
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
1928