1 """The ``lxml.html`` tool set for HTML handling.
2 """
3
4 import threading
5 import re
6 try:
7 from urlparse import urljoin
8 except ImportError:
9
10 from urllib.parse import urljoin
11 import copy
12 from lxml import etree
13 from lxml.html import defs
14 from lxml import cssselect
15 from lxml.html._setmixin import SetMixin
16 try:
17 from UserDict import DictMixin
18 except ImportError:
19
20 from lxml.html._dictmixin import DictMixin
21 try:
22 set
23 except NameError:
24
25 from sets import Set as set
26 try:
27 bytes = __builtins__["bytes"]
28 except (KeyError, NameError):
29
30 bytes = str
31 try:
32 unicode = __builtins__["unicode"]
33 except (KeyError, NameError):
34
35 unicode = str
36 try:
37 basestring = __builtins__["basestring"]
38 except (KeyError, NameError):
39
40 basestring = (str, bytes)
41
43 if not s:
44 return s
45 import sys
46 if sys.version_info[0] >= 3:
47 sub = re.compile(r"^(\s*)u'", re.M).sub
48 else:
49 sub = re.compile(r"^(\s*)b'", re.M).sub
50 return sub(r"\1'", s)
51
# Public names of this module, i.e. what ``from <module> import *`` exports.
# Each name is listed exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
57
# Namespace URI of XHTML; bound to the prefix ``x`` in the XPath
# expressions below so that namespaced tags are matched as well.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath lookups.  Each of these matches the plain tag and the
# same tag in the XHTML namespace, on the context node or its descendants.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})

# Elements whose ``class`` attribute contains $class_name as one of its
# whitespace-separated tokens (both sides are padded with spaces before
# the substring test).
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements whose ``id`` attribute equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath string value of the context node.
_collect_string_content = etree.XPath("string()")
# ``url(...)`` references in CSS (case-insensitive); group 1 is the argument,
# including any surrounding double or single quotes.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
# ``@import "..."`` statements in CSS; only the double-quoted form is
# matched, and group 1 is the text between the quotes.
_css_import_re = re.compile(r'@import "(.*?)"')
# ``<label>`` elements anywhere in the document whose ``for`` equals $id.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Runs of non-space characters; applied to the ``archive`` attribute of
# ``<object>`` elements to split it into individual entries.
_archive_re = re.compile(r'[^ ]+')
75
77 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
78 return s[1:-1], pos+1
79 else:
80 return s,pos
81
91
97
99
101 """
102 Returns the base URL, given when the page was parsed.
103
104 Use with ``urlparse.urljoin(el.base_url, href)`` to get
105 absolute URLs.
106 """
107 return self.getroottree().docinfo.URL
108 base_url = property(base_url, doc=base_url.__doc__)
109
115 forms = property(forms, doc=forms.__doc__)
116
118 """
119 Return the <body> element. Can be called from a child element
120 to get the document's head.
121 """
122 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
123 body = property(body, doc=body.__doc__)
124
126 """
127 Returns the <head> element. Can be called from a child
128 element to get the document's head.
129 """
130 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
131 head = property(head, doc=head.__doc__)
132
134 """
135 Get or set any <label> element associated with this element.
136 """
137 id = self.get('id')
138 if not id:
139 return None
140 result = _label_xpath(self, id=id)
141 if not result:
142 return None
143 else:
144 return result[0]
146 id = self.get('id')
147 if not id:
148 raise TypeError(
149 "You cannot set a label for an element (%r) that has no id"
150 % self)
151 if _nons(label.tag) != 'label':
152 raise TypeError(
153 "You can only assign label to a label element (not %r)"
154 % label)
155 label.set('for', id)
160 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
161
163 """
164 Removes this element from the tree, including its children and
165 text. The tail text is joined to the previous element or
166 parent.
167 """
168 parent = self.getparent()
169 assert parent is not None
170 if self.tail:
171 previous = self.getprevious()
172 if previous is None:
173 parent.text = (parent.text or '') + self.tail
174 else:
175 previous.tail = (previous.tail or '') + self.tail
176 parent.remove(self)
177
179 """
180 Remove the tag, but not its children or text. The children and text
181 are merged into the parent.
182
183 Example::
184
185 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
186 >>> h.find('.//b').drop_tag()
187 >>> print(tostring(h, encoding=unicode))
188 <div>Hello World!</div>
189 """
190 parent = self.getparent()
191 assert parent is not None
192 previous = self.getprevious()
193 if self.text and isinstance(self.tag, basestring):
194
195 if previous is None:
196 parent.text = (parent.text or '') + self.text
197 else:
198 previous.tail = (previous.tail or '') + self.text
199 if self.tail:
200 if len(self):
201 last = self[-1]
202 last.tail = (last.tail or '') + self.tail
203 elif previous is None:
204 parent.text = (parent.text or '') + self.tail
205 else:
206 previous.tail = (previous.tail or '') + self.tail
207 index = parent.index(self)
208 parent[index:index+1] = self[:]
209
211 """
212 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
213 """
214 rel = rel.lower()
215 return [el for el in _rel_links_xpath(self)
216 if el.get('rel').lower() == rel]
217
219 """
220 Find any elements with the given class name.
221 """
222 return _class_xpath(self, class_name=class_name)
223
225 """
226 Get the first element in a document with the given id. If none is
227 found, return the default argument if provided or raise KeyError
228 otherwise.
229
230 Note that there can be more than one element with the same id,
231 and this isn't uncommon in HTML documents found in the wild.
232 Browsers return only the first match, and this function does
233 the same.
234 """
235 try:
236
237
238 return _id_xpath(self, id=id)[0]
239 except IndexError:
240 if default:
241 return default[0]
242 else:
243 raise KeyError(id)
244
def text_content(self):
    """
    Return the text held by this tag together with the text of all
    of its children, as computed by the XPath ``string()`` function.
    """
    collected = _collect_string_content(self)
    return collected
250
252 """
253 Run the CSS expression on this element and its children,
254 returning a list of the results.
255
256 Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
257 that pre-compiling the expression can provide a substantial
258 speedup.
259 """
260 return cssselect.CSSSelector(expr)(self)
261
262
263
264
265
267 """
268 Make all links in the document absolute, given the
269 ``base_url`` for the document (the full URL where the document
270 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
271
272 If ``resolve_base_href`` is true, then any ``<base href>``
273 tags in the document are used *and* removed from the document.
274 If it is false then any such tag is ignored.
275 """
276 if base_url is None:
277 base_url = self.base_url
278 if base_url is None:
279 raise TypeError(
280 "No base_url given, and the document has no base_url")
281 if resolve_base_href:
282 self.resolve_base_href()
283 def link_repl(href):
284 return urljoin(base_url, href)
285 self.rewrite_links(link_repl)
286
288 """
289 Find any ``<base href>`` tag in the document, and apply its
290 values to all links found in the document. Also remove the
291 tag once it has been applied.
292 """
293 base_href = None
294 basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
295 for b in basetags:
296 base_href = b.get('href')
297 b.drop_tree()
298 if not base_href:
299 return
300 self.make_links_absolute(base_href, resolve_base_href=False)
301
303 """
304 Yield (element, attribute, link, pos), where attribute may be None
305 (indicating the link is in the text). ``pos`` is the position
306 where the link occurs; often 0, but sometimes something else in
307 the case of links in stylesheets or style tags.
308
309 Note: <base href> is *not* taken into account in any way. The
310 link you get is exactly the link in the document.
311
312 Note: multiple links inside of a single text string or
313 attribute value are returned in reversed order. This makes it
314 possible to replace or delete them from the text string value
315 based on their reported text positions. Otherwise, a
316 modification at one text position can change the positions of
317 links reported later on.
318 """
319 link_attrs = defs.link_attrs
320 for el in self.iter():
321 attribs = el.attrib
322 tag = _nons(el.tag)
323 if tag != 'object':
324 for attrib in link_attrs:
325 if attrib in attribs:
326 yield (el, attrib, attribs[attrib], 0)
327 elif tag == 'object':
328 codebase = None
329
330
331 if 'codebase' in attribs:
332 codebase = el.get('codebase')
333 yield (el, 'codebase', codebase, 0)
334 for attrib in 'classid', 'data':
335 if attrib in attribs:
336 value = el.get(attrib)
337 if codebase is not None:
338 value = urljoin(codebase, value)
339 yield (el, attrib, value, 0)
340 if 'archive' in attribs:
341 for match in _archive_re.finditer(el.get('archive')):
342 value = match.group(0)
343 if codebase is not None:
344 value = urljoin(codebase, value)
345 yield (el, 'archive', value, match.start())
346 if tag == 'param':
347 valuetype = el.get('valuetype') or ''
348 if valuetype.lower() == 'ref':
349
350
351
352
353
354
355 yield (el, 'value', el.get('value'), 0)
356 if tag == 'style' and el.text:
357 urls = [
358 _unquote_match(match.group(1), match.start(1))
359 for match in _css_url_re.finditer(el.text)
360 ] + [
361 (match.group(1), match.start(1))
362 for match in _css_import_re.finditer(el.text)
363 ]
364 if urls:
365
366 urls = [ (start, url) for (url, start) in urls ]
367 urls.sort()
368
369
370 urls.reverse()
371 for start, url in urls:
372 yield (el, None, url, start)
373 if 'style' in attribs:
374 urls = list(_css_url_re.finditer(attribs['style']))
375 if urls:
376
377 for match in urls[::-1]:
378 url, start = _unquote_match(match.group(1), match.start(1))
379 yield (el, 'style', url, start)
380
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called (with surrounding
    whitespace stripped from the link), and the return value will
    replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.  Any further links that
    were found in that same attribute or text are then left alone,
    since there is nothing left to rewrite; ``link_repl_func`` is
    still called for them.
    """
    if base_href is not None:
        # Make the links absolute up front so that link_repl_func()
        # only ever sees links that already include base_href.
        self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()
    for el, attrib, link, pos in self.iterlinks():
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue

        # One attribute or text node can hold several links.  If an
        # earlier one made us drop the whole attribute/text, the
        # reported position no longer refers to anything, so skip it
        # instead of failing with KeyError or splicing into ''.
        if attrib is None:
            if not el.text:
                continue
        elif attrib not in el.attrib:
            continue

        if new_link is None:
            # Remove the attribute or the element text entirely.
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            continue

        if attrib is None:
            # Link inside element text: splice the replacement in at pos.
            el.text = el.text[:pos] + new_link + el.text[pos+len(link):]
        else:
            cur = el.attrib[attrib]
            if not pos and len(cur) == len(link):
                # Most common case: the link is the whole attribute value.
                el.attrib[attrib] = new_link
            else:
                el.attrib[attrib] = cur[:pos] + new_link + cur[pos+len(link):]
427
428
430 """
431 An object that represents a method on an element as a function;
432 the function takes either an element or an HTML string. It
433 returns whatever the function normally returns, or if the function
434 works in-place (and so returns None) it returns a serialized form
435 of the resulting document.
436 """
442 result_type = type(doc)
443 if isinstance(doc, basestring):
444 if 'copy' in kw:
445 raise TypeError(
446 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
447 doc = fromstring(doc, **kw)
448 else:
449 if 'copy' in kw:
450 copy = kw.pop('copy')
451 else:
452 copy = self.copy
453 if copy:
454 doc = copy.deepcopy(doc)
455 meth = getattr(doc, self.name)
456 result = meth(*args, **kw)
457
458 if result is None:
459
460 return _transform_result(result_type, doc)
461 else:
462 return result
463
# Module-level counterparts of the element methods of the same name, each
# built with _MethodFunc.  ``copy`` is False for the helpers that only look
# things up and True for the ones that rewrite links in the document.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
470
473
476
479
482
483
485 """A lookup scheme for HTML Element classes.
486
487 To create a lookup instance with different Element classes, pass a tag
488 name mapping of Element classes in the ``classes`` keyword argument and/or
489 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
490 The special key '*' denotes a Mixin class that should be mixed into all
491 Element classes.
492 """
493 _default_element_classes = {}
494
495 - def __init__(self, classes=None, mixins=None):
512
513 - def lookup(self, node_type, document, namespace, name):
524
525
526
527
528
537
540 """
541 Parses several HTML elements, returning a list of elements.
542
543 The first item in the list may be a string (though leading
544 whitespace is removed). If no_leading_text is true, then it will
545 be an error if there is leading text, and it will always be a list
546 of only elements.
547
548 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
549 """
550 if parser is None:
551 parser = html_parser
552
553 start = html[:20].lstrip().lower()
554 if not start.startswith('<html') and not start.startswith('<!doctype'):
555 html = '<html><body>%s</body></html>' % html
556 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
557 assert _nons(doc.tag) == 'html'
558 bodies = [e for e in doc if _nons(e.tag) == 'body']
559 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
560 body = bodies[0]
561 elements = []
562 if no_leading_text and body.text and body.text.strip():
563 raise etree.ParserError(
564 "There is leading text: %r" % body.text)
565 if body.text and body.text.strip():
566 elements.append(body.text)
567 elements.extend(body)
568
569
570 return elements
571
574 """
575 Parses a single HTML element; it is an error if there is more than
576 one element, or if anything but whitespace precedes or follows the
577 element.
578
579 If create_parent is true (or is a tag name) then a parent node
580 will be created to encapsulate the HTML in a single element. In
581 this case, leading or trailing text is allowed.
582
583 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
584 """
585 if parser is None:
586 parser = html_parser
587
588 accept_leading_text = bool(create_parent)
589
590 elements = fragments_fromstring(
591 html, parser=parser, no_leading_text=not accept_leading_text,
592 base_url=base_url, **kw)
593
594 if create_parent:
595 if not isinstance(create_parent, basestring):
596 create_parent = 'div'
597 new_root = Element(create_parent)
598 if elements:
599 if isinstance(elements[0], basestring):
600 new_root.text = elements[0]
601 del elements[0]
602 new_root.extend(elements)
603 return new_root
604
605 if not elements:
606 raise etree.ParserError('No elements found')
607 if len(elements) > 1:
608 raise etree.ParserError(
609 "Multiple elements found (%s)"
610 % ', '.join([_element_name(e) for e in elements]))
611 el = elements[0]
612 if el.tail and el.tail.strip():
613 raise etree.ParserError(
614 "Element followed by text: %r" % el.tail)
615 el.tail = None
616 return el
617
618 -def fromstring(html, base_url=None, parser=None, **kw):
680
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Read an HTML document from a filename, a URL, or a file-like object.

    The result is a document tree, not an element; call
    ``parse(...).getroot()`` to obtain the document root.

    The ``base_url`` keyword overrides the base URL of the document,
    which is mostly helpful when reading from a file-like object.
    When no ``parser`` is passed, the module-level ``html_parser`` is
    used.  Any further keyword arguments are handed to ``etree.parse``.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
693
701
703 if isinstance(el, etree.CommentBase):
704 return 'comment'
705 elif isinstance(el, basestring):
706 return 'string'
707 else:
708 return _nons(el.tag)
709
710
711
712
713
818
819 HtmlElementClassLookup._default_element_classes['form'] = FormElement
820
857
859 if not url:
860 raise ValueError("cannot submit, no URL provided")
861
862 try:
863 from urllib import urlencode, urlopen
864 except ImportError:
865 from urllib.request import urlopen
866 from urllib.parse import urlencode
867 if method == 'GET':
868 if '?' in url:
869 url += '&'
870 else:
871 url += '?'
872 url += urlencode(values)
873 data = None
874 else:
875 data = urlencode(values)
876 return urlopen(url, data)
877
879
887 raise KeyError(
888 "You cannot remove keys from ElementDict")
892 return item in self.inputs
893
895 return '<%s for form %s>' % (
896 self.__class__.__name__,
897 self.inputs.form._name())
898
964
992
class TextareaElement(InputMixin, HtmlElement):
    """
    Element class for ``<textarea>``.  The name is available as
    ``.name``; the content is read and changed through ``.value``.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Elements in the XHTML namespace are serialised as XML,
        # everything else as HTML.
        xhtml_prefix = "{%s}" % XHTML_NAMESPACE
        method = 'xml' if self.tag.startswith(xhtml_prefix) else 'html'
        # Leading text first, then the markup of every child element.
        pieces = [self.text or '']
        for child in self:
            pieces.append(etree.tostring(child, method=method, encoding=unicode))
        return ''.join(pieces)

    def _value__set(self, value):
        # Drop all child elements, then store the new content as text.
        del self[:]
        self.text = value

    def _value__del(self):
        # Deleting the value empties the element: no children, empty text.
        del self[:]
        self.text = ''

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1019
1020 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1021
1023 """
1024 ``<select>`` element. You can get the name with ``.name``.
1025
1026 ``.value`` will be the value of the selected option, unless this
1027 is a multi-select element (``<select multiple>``), in which case
1028 it will be a set-like object. In either case ``.value_options``
1029 gives the possible values.
1030
1031 The boolean attribute ``.multiple`` shows if this is a
1032 multi-select.
1033 """
1034
1036 """
1037 Get/set the value of this select (the selected option).
1038
1039 If this is a multi-select, this is a set-like object that
1040 represents all the selected options.
1041 """
1042 if self.multiple:
1043 return MultipleSelectOptions(self)
1044 for el in _options_xpath(self):
1045 if el.get('selected') is not None:
1046 value = el.get('value')
1047 if value is None:
1048 value = el.text or ''
1049 if value:
1050 value = value.strip()
1051 return value
1052 return None
1053
1055 if self.multiple:
1056 if isinstance(value, basestring):
1057 raise TypeError(
1058 "You must pass in a sequence")
1059 self.value.clear()
1060 self.value.update(value)
1061 return
1062 if value is not None:
1063 value = value.strip()
1064 for el in _options_xpath(self):
1065 opt_value = el.get('value')
1066 if opt_value is None:
1067 opt_value = el.text or ''
1068 if opt_value:
1069 opt_value = opt_value.strip()
1070 if opt_value == value:
1071 checked_option = el
1072 break
1073 else:
1074 raise ValueError(
1075 "There is no option with the value of %r" % value)
1076 for el in _options_xpath(self):
1077 if 'selected' in el.attrib:
1078 del el.attrib['selected']
1079 if value is not None:
1080 checked_option.set('selected', '')
1081
1088
1089 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1090
1105 value_options = property(value_options, doc=value_options.__doc__)
1106
1108 """
1109 Boolean attribute: is there a ``multiple`` attribute on this element.
1110 """
1111 return 'multiple' in self.attrib
1113 if value:
1114 self.set('multiple', '')
1115 elif 'multiple' in self.attrib:
1116 del self.attrib['multiple']
1117 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1118
1119 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1120
1122 """
1123 Represents all the selected options in a ``<select multiple>`` element.
1124
1125 You can add to this set-like option to select an option, or remove
1126 to unselect the option.
1127 """
1128
1130 self.select = select
1131
1133 """
1134 Iterator of all the ``<option>`` elements.
1135 """
1136 return iter(_options_xpath(self.select))
1137 options = property(options)
1138
1140 for option in self.options:
1141 yield option.get('value')
1142
def add(self, item):
    """
    Mark the first option whose ``value`` attribute equals ``item`` as
    selected; raise ValueError if no option has that value.
    """
    chosen = None
    for candidate in self.options:
        if candidate.get('value') == item:
            chosen = candidate
            break
    if chosen is None:
        raise ValueError(
            "There is no option with the value %r" % item)
    chosen.set('selected', '')
1151
1153 for option in self.options:
1154 if option.get('value') == item:
1155 if 'selected' in option.attrib:
1156 del option.attrib['selected']
1157 else:
1158 raise ValueError(
1159 "The option %r is not currently selected" % item)
1160 break
1161 else:
1162 raise ValueError(
1163 "There is not option with the value %r" % item)
1164
1166 return '<%s {%s} for select name=%r>' % (
1167 self.__class__.__name__,
1168 ', '.join([repr(v) for v in self]),
1169 self.select.name)
1170
1172 """
1173 This object represents several ``<input type=radio>`` elements
1174 that have the same name.
1175
1176 You can use this like a list, but also use the property
1177 ``.value`` to check/uncheck inputs. Also you can use
1178 ``.value_options`` to get the possible values.
1179 """
1180
1182 """
1183 Get/set the value, which checks the radio with that value (and
1184 unchecks any other value).
1185 """
1186 for el in self:
1187 if 'checked' in el.attrib:
1188 return el.get('value')
1189 return None
1190
1192 if value is not None:
1193 for el in self:
1194 if el.get('value') == value:
1195 checked_option = el
1196 break
1197 else:
1198 raise ValueError(
1199 "There is no radio input with the value %r" % value)
1200 for el in self:
1201 if 'checked' in el.attrib:
1202 del el.attrib['checked']
1203 if value is not None:
1204 checked_option.set('checked', '')
1205
1208
1209 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1210
1212 """
1213 Returns a list of all the possible values.
1214 """
1215 return [el.get('value') for el in self]
1216 value_options = property(value_options, doc=value_options.__doc__)
1217
1219 return '%s(%s)' % (
1220 self.__class__.__name__,
1221 list.__repr__(self))
1222
1224 """
1225 Represents a group of checkboxes (``<input type=checkbox>``) that
1226 have the same name.
1227
1228 In addition to using this like a list, the ``.value`` attribute
1229 returns a set-like object that you can add to or remove from to
1230 check and uncheck checkboxes. You can also use ``.value_options``
1231 to get the possible values.
1232 """
1233
1235 """
1236 Return a set-like object that can be modified to check or
1237 uncheck individual checkboxes according to their value.
1238 """
1239 return CheckboxValues(self)
1249 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1250
1252 return '%s(%s)' % (
1253 self.__class__.__name__, list.__repr__(self))
1254
1256
1257 """
1258 Represents the values of the checked checkboxes in a group of
1259 checkboxes with the same name.
1260 """
1261
1264
1266 return iter([
1267 el.get('value')
1268 for el in self.group
1269 if 'checked' in el.attrib])
1270
def add(self, value):
    """
    Check the first checkbox of the group whose ``value`` attribute
    equals ``value``; raise KeyError if the group has no such checkbox.
    """
    target = None
    for box in self.group:
        if box.get('value') == value:
            target = box
            break
    if target is None:
        raise KeyError("No checkbox with value %r" % value)
    target.set('checked', '')
1278
1280 for el in self.group:
1281 if el.get('value') == value:
1282 if 'checked' in el.attrib:
1283 del el.attrib['checked']
1284 else:
1285 raise KeyError(
1286 "The checkbox with value %r was already unchecked" % value)
1287 break
1288 else:
1289 raise KeyError(
1290 "No checkbox with value %r" % value)
1291
1293 return '<%s {%s} for checkboxes name=%r>' % (
1294 self.__class__.__name__,
1295 ', '.join([repr(v) for v in self]),
1296 self.group.name)
1297
1381
1382 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1383
1385 """
1386 Represents a ``<label>`` element.
1387
1388 Label elements are linked to other elements with their ``for``
1389 attribute. You can access this element with ``label.for_element``.
1390 """
1391
1393 """
1394 Get/set the element this label points to. Return None if it
1395 can't be found.
1396 """
1397 id = self.get('for')
1398 if not id:
1399 return None
1400 return self.body.get_element_by_id(id)
1402 id = other.get('id')
1403 if not id:
1404 raise TypeError(
1405 "Element %r has no id attribute" % other)
1406 self.set('for', id)
1410 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1411 doc=_for_element__get.__doc__)
1412
1413 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1414
1415
1416
1417
1418
1420 """Convert all tags in an HTML tree to XHTML by moving them to the
1421 XHTML namespace.
1422 """
1423 try:
1424 html = html.getroot()
1425 except AttributeError:
1426 pass
1427 prefix = "{%s}" % XHTML_NAMESPACE
1428 for el in html.iter():
1429 tag = el.tag
1430 if isinstance(tag, basestring):
1431 if tag[0] != '{':
1432 el.tag = prefix + tag
1433
1435 """Convert all tags in an XHTML tree to HTML by removing their
1436 XHTML namespace.
1437 """
1438 try:
1439 xhtml = xhtml.getroot()
1440 except AttributeError:
1441 pass
1442 prefix = "{%s}" % XHTML_NAMESPACE
1443 prefix_len = len(prefix)
1444 for el in xhtml.iter(prefix + "*"):
1445 el.tag = el.tag[prefix_len:]
1446
1447
1448
# Bound ``sub`` methods of two compiled patterns that match a
# ``<meta http-equiv="Content-Type" ...>`` tag: the first is built from a
# text pattern, the second from the same pattern encoded to ASCII bytes.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1453
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding)
    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    # Use the substitution that matches the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
1500
1501 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1502
1504 """
1505 Open the HTML document in a web browser, saving it to a temporary
1506 file to open it. Note that this does not delete the file after
1507 use. This is mainly meant for debugging.
1508 """
1509 import os
1510 import webbrowser
1511 import tempfile
1512 if not isinstance(doc, etree._ElementTree):
1513 doc = etree.ElementTree(doc)
1514 handle, fn = tempfile.mkstemp(suffix='.html')
1515 f = os.fdopen(handle, 'wb')
1516 try:
1517 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1518 finally:
1519
1520 f.close()
1521 url = 'file://' + fn.replace(os.path.sep, '/')
1522 print(url)
1523 webbrowser.open(url)
1524
1525
1526
1527
1528
1530 """An HTML parser that is configured to return lxml.html Element
1531 objects.
1532 """
1536
1538 """An XML parser that is configured to return lxml.html Element
1539 objects.
1540
1541 Note that this parser is not really XHTML aware unless you let it
1542 load a DTD that declares the HTML entities. To do this, make sure
1543 you have the XHTML DTDs installed in your catalogs, and create the
1544 parser like this::
1545
1546 >>> parser = XHTMLParser(load_dtd=True)
1547
1548 If you additionally want to validate the document, use this::
1549
1550 >>> parser = XHTMLParser(dtd_validation=True)
1551
1552 For catalog support, see http://www.xmlsoft.org/catalog.html.
1553 """
1557
1559 """Create a new HTML Element.
1560
1561 This can also be used for XHTML documents.
1562 """
1563 v = html_parser.makeelement(*args, **kw)
1564 return v
1565
1566 html_parser = HTMLParser()
1567 xhtml_parser = XHTMLParser()
1568