| Home | Trees | Indices | Help |
|
|---|
|
|
1 import threading
2 import re
3 import urlparse
4 import copy
5 from lxml import etree
6 from lxml.html import defs
7 from lxml import cssselect
8 from lxml.html.setmixin import SetMixin
9 try:
10 from UserDict import DictMixin
11 except ImportError:
12 # DictMixin was introduced in Python 2.4
13 from lxml.html._dictmixin import DictMixin
14 import sets
15
16 __all__ = [
17 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
18 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
19 'find_rel_links', 'find_class', 'make_links_absolute',
20 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser']
21
22 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")
23 #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
24 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
25 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
26 _collect_string_content = etree.XPath("string()")
27 _css_url_re = re.compile(r'url\((.*?)\)', re.I)
28 _css_import_re = re.compile(r'@import "(.*?)"')
29 _label_xpath = etree.XPath("//label[@for=$id]")
30 _archive_re = re.compile(r'[^ ]+')
31
33
35 """
36 Returns the base URL, given when the page was parsed.
37
38 Use with ``urlparse.urljoin(el.base_url, href)`` to get
39 absolute URLs.
40 """
41 return self.getroottree().docinfo.URL
42 base_url = property(base_url, doc=base_url.__doc__)
43
49 forms = property(forms, doc=forms.__doc__)
50
52 """
53 Return the <body> element. Can be called from a child element
54 to get the document's body.
55 """
56 return self.xpath('//body')[0] # '//body' is an absolute path, so the element it is called on does not matter
57 body = property(body, doc=body.__doc__)
58
60 """
61 Returns the <head> element. Can be called from a child
62 element to get the document's head.
63 """
64 return self.xpath('//head')[0]
65 head = property(head, doc=head.__doc__)
66
68 """
69 Get or set any <label> element associated with this element.
70 """
71 id = self.get('id')
72 if not id:
73 return None
74 result = _label_xpath(self, id=id)
75 if not result:
76 return None
77 else:
78 return result[0]
80 id = self.get('id')
81 if not id:
82 raise TypeError(
83 "You cannot set a label for an element (%r) that has no id"
84 % self)
85 if not label.tag == 'label':
86 raise TypeError(
87 "You can only assign label to a label element (not %r)"
88 % label)
89 label.set('for', id)
94 label = property(label__get, label__set, label__del, doc=label__get.__doc__)
95
97 """
98 Removes this element from the tree, including its children and
99 text. The tail text is joined to the previous element or
100 parent.
101 """
102 parent = self.getparent()
103 assert parent is not None
104 if self.tail:
105 previous = self.getprevious()
106 if previous is None:
107 parent.text = (parent.text or '') + self.tail
108 else:
109 previous.tail = (previous.tail or '') + self.tail
110 parent.remove(self)
111
113 """
114 Remove the tag, but not its children or text. The children and text
115 are merged into the parent.
116
117 Example::
118
119 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
120 >>> h.find('//b').drop_tag()
121 >>> print tostring(h)
122 <div>Hello World!</div>
123 """
124 parent = self.getparent()
125 assert parent is not None
126 previous = self.getprevious()
127 if self.text and isinstance(self.tag, basestring):
128 # not a Comment, etc.
129 if previous is None:
130 parent.text = (parent.text or '') + self.text
131 else:
132 previous.tail = (previous.tail or '') + self.text
133 if self.tail:
134 if len(self):
135 last = self[-1]
136 last.tail = (last.tail or '') + self.tail
137 elif previous is None:
138 parent.text = (parent.text or '') + self.tail
139 else:
140 previous.tail = (previous.tail or '') + self.tail
141 index = parent.index(self)
142 parent[index:index+1] = self[:]
143
145 """
146 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
147 """
148 rel = rel.lower()
149 return [el for el in _rel_links_xpath(self)
150 if el.get('rel').lower() == rel]
151
153 """
154 Find any elements with the given class name.
155 """
156 return _class_xpath(self, class_name=class_name)
157
159 """
160 Get the first element in a document with the given id. If none is
161 found, return the default argument if provided or raise KeyError
162 otherwise.
163
164 Note that there can be more than one element with the same id,
165 and this isn't uncommon in HTML documents found in the wild.
166 Browsers return only the first match, and this function does
167 the same.
168 """
169 try:
170 # FIXME: should this check for multiple matches?
171 # browsers just return the first one
172 return _id_xpath(self, id=id)[0]
173 except IndexError:
174 if default:
175 return default[0]
176 else:
177 raise KeyError, id
178
180 """
181 Return the text content of the tag (and the text in any children).
182 """
183 return _collect_string_content(self)
184
186 """
187 Run the CSS expression on this element and its children,
188 returning a list of the results.
189
190 Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
191 that pre-compiling the expression can provide a substantial
192 speedup.
193 """
194 return cssselect.CSSSelect(expr)(self)
195
196 ########################################
197 ## Link functions
198 ########################################
199
201 """
202 Make all links in the document absolute, given the
203 ``base_url`` for the document (the full URL where the document
204 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
205
206 If ``resolve_base_href`` is true, then any ``<base href>``
207 tags in the document are used *and* removed from the document.
208 If it is false then any such tag is ignored.
209 """
210 if base_url is None:
211 base_url = self.base_url
212 if base_url is None:
213 raise TypeError(
214 "No base_url given, and the document has no base_url")
215 if resolve_base_href:
216 self.resolve_base_href()
217 def link_repl(href):
218 return urlparse.urljoin(base_url, href)
219 self.rewrite_links(link_repl)
220
222 """
223 Find any ``<base href>`` tag in the document, and apply its
224 values to all links found in the document. Also remove the
225 tag once it has been applied.
226 """
227 base_href = None
228 basetags = self.xpath('//base[@href]')
229 for b in basetags:
230 base_href = b.get('href')
231 b.drop_tree()
232 if not base_href:
233 return
234 self.make_links_absolute(base_href, resolve_base_href=False)
235
237 """
238 Yield (element, attribute, link, pos), where attribute may be None
239 (indicating the link is in the text). ``pos`` is the position
240 where the link occurs; often 0, but sometimes something else in
241 the case of links in stylesheets or style tags.
242
243 Note: <base href> is *not* taken into account in any way. The
244 link you get is exactly the link in the document.
245 """
246 link_attrs = defs.link_attrs
247 for el in self.getiterator():
248 attribs = el.attrib
249 if el.tag != 'object':
250 for attrib in link_attrs:
251 if attrib in attribs:
252 yield (el, attrib, attribs[attrib], 0)
253 elif el.tag == 'object':
254 codebase = None
255 ## <object> tags have attributes that are relative to
256 ## codebase
257 if 'codebase' in attribs:
258 codebase = el.get('codebase')
259 yield (el, 'codebase', codebase, 0)
260 for attrib in 'classid', 'data':
261 if attrib in attribs:
262 value = el.get(attrib)
263 if codebase is not None:
264 value = urlparse.urljoin(codebase, value)
265 yield (el, attrib, value, 0)
266 if 'archive' in attribs:
267 for match in _archive_re.finditer(el.get('archive')):
268 value = match.group(0)
269 if codebase is not None:
270 value = urlparse.urljoin(codebase, value)
271 yield (el, 'archive', value, match.start())
272 if el.tag == 'param':
273 valuetype = el.get('valuetype') or ''
274 if valuetype.lower() == 'ref':
275 ## FIXME: while it's fine we *find* this link,
276 ## according to the spec we aren't supposed to
277 ## actually change the value, including resolving
278 ## it. It can also still be a link, even if it
279 ## doesn't have a valuetype="ref" (which seems to be the norm)
280 ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
281 yield (el, 'value', el.get('value'), 0)
282 if el.tag == 'style' and el.text:
283 for match in _css_url_re.finditer(el.text):
284 yield (el, None, match.group(1), match.start(1))
285 for match in _css_import_re.finditer(el.text):
286 yield (el, None, match.group(1), match.start(1))
287 if 'style' in attribs:
288 for match in _css_url_re.finditer(attribs['style']):
289 yield (el, 'style', match.group(1), match.start(1))
290
293 """
294 Rewrite all the links in the document. For each link
295 ``link_repl_func(link)`` will be called, and the return value
296 will replace the old link.
297
298 Note that links may not be absolute (unless you first called
299 ``make_links_absolute()``), and may be internal (e.g.,
300 ``'#anchor'``). They can also be values like
301 ``'mailto:email'`` or ``'javascript:expr'``.
302
303 If you give ``base_href`` then all links passed to
304 ``link_repl_func()`` will take that into account.
305
306 If the ``link_repl_func`` returns None, the attribute or
307 tag text will be removed completely.
308 """
309 if base_href is not None:
310 # FIXME: this can be done in one pass with a wrapper
311 # around link_repl_func
312 self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
313 elif resolve_base_href:
314 self.resolve_base_href()
315 for el, attrib, link, pos in self.iterlinks():
316 new_link = link_repl_func(link)
317 if new_link == link:
318 continue
319 if new_link is None:
320 # Remove the attribute or element content
321 if attrib is None:
322 el.text = ''
323 else:
324 del el.attrib[attrib]
325 continue
326 if attrib is None:
327 new = el.text[:pos] + new_link + el.text[pos+len(link):]
328 el.text = new
329 else:
330 cur = el.attrib[attrib]
331 if not pos and len(cur) == len(link):
332 # Most common case
333 el.attrib[attrib] = new_link
334 else:
335 new = cur[:pos] + new_link + cur[pos+len(link):]
336 el.attrib[attrib] = new
337
338
340 """
341 An object that represents a method on an element as a function;
342 the function takes either an element or an HTML string. It
343 returns whatever the function normally returns, or if the function
344 works in-place (and so returns None) it returns a serialized form
345 of the resulting document.
346 """
348 self.name = name
349 self.copy = copy
350 self.__doc__ = getattr(source_class, self.name).__doc__
352 if isinstance(doc, basestring):
353 if 'copy' in kw:
354 raise TypeError(
355 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
356 return_string = True
357 doc = fromstring(doc, **kw)
358 else:
359 if 'copy' in kw:
360 copy = kw.pop('copy')
361 else:
362 copy = self.copy
363 return_string = False
364 if copy:
365 doc = copy.deepcopy(doc)
366 meth = getattr(doc, self.name)
367 result = meth(*args, **kw)
368 # FIXME: this None test is a bit sloppy
369 if result is None:
370 # Then return what we got in
371 if return_string:
372 return tostring(doc)
373 else:
374 return doc
375 else:
376 return result
377
378 find_rel_links = _MethodFunc('find_rel_links', copy=False)
379 find_class = _MethodFunc('find_class', copy=False)
380 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
381 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
382 iterlinks = _MethodFunc('iterlinks', copy=False)
383 rewrite_links = _MethodFunc('rewrite_links', copy=True)
384
387
390
393
396
397
399 """A lookup scheme for HTML Element classes.
400
401 To create a lookup instance with different Element classes, pass a tag
402 name mapping of Element classes in the ``classes`` keyword argument and/or
403 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
404 The special key '*' denotes a Mixin class that should be mixed into all
405 Element classes.
406 """
407 _default_element_classes = {}
408
410 etree.CustomElementClassLookup.__init__(self)
411 if classes is None:
412 classes = self._default_element_classes.copy()
413 if mixins:
414 mixers = {}
415 for name, value in mixins:
416 if name == '*':
417 for n in classes.keys():
418 mixers.setdefault(n, []).append(value)
419 else:
420 mixers.setdefault(name, []).append(value)
421 for name, mix_bases in mixers.items():
422 cur = classes.get(name, HtmlElement)
423 bases = tuple(mix_bases + [cur])
424 classes[name] = type(cur.__name__, bases, {})
425 self._element_classes = classes
426
428 if node_type == 'element':
429 return self._element_classes.get(name.lower(), HtmlElement)
430 elif node_type == 'comment':
431 return HtmlComment
432 elif node_type == 'PI':
433 return HtmlProcessingInstruction
434 elif node_type == 'entity':
435 return HtmlEntity
436 # Otherwise normal lookup
437 return None
438
439 ################################################################################
440 # parsing
441 ################################################################################
442
444 value = etree.HTML(html, html_parser, **kw)
445 if value is None:
446 raise etree.ParserError(
447 "Document is empty")
448 return value
449
451 """
452 Parses several HTML elements, returning a list of elements.
453
454 The first item in the list may be a string (though leading
455 whitespace is removed). If no_leading_text is true, then it will
456 be an error if there is leading text, and it will always be a list
457 of only elements.
458 """
459 # FIXME: check what happens when you give html with a body, head, etc.
460 start = html[:20].lstrip().lower()
461 if not start.startswith('<html') and not start.startswith('<!doctype'):
462 html = '<html><body>%s</body></html>' % html
463 doc = document_fromstring(html, **kw)
464 assert doc.tag == 'html'
465 bodies = [e for e in doc if e.tag == 'body']
466 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
467 body = bodies[0]
468 elements = []
469 if no_leading_text and body.text and body.text.strip():
470 raise etree.ParserError(
471 "There is leading text: %r" % body.text)
472 if body.text and body.text.strip():
473 elements.append(body.text)
474 elements.extend(body)
475 # FIXME: removing the reference to the parent artificial document
476 # would be nice
477 return elements
478
480 """
481 Parses a single HTML element; it is an error if there is more than
482 one element, or if anything but whitespace precedes or follows the
483 element.
484
485 If create_parent is true (or is a tag name) then a parent node
486 will be created to encapsulate the HTML in a single element.
487 """
488 if create_parent:
489 if not isinstance(create_parent, basestring):
490 create_parent = 'div'
491 return fragment_fromstring('<%s>%s</%s>' % (
492 create_parent, html, create_parent), **kw)
493 elements = fragments_fromstring(html, no_leading_text=True)
494 if not elements:
495 raise etree.ParserError(
496 "No elements found")
497 if len(elements) > 1:
498 raise etree.ParserError(
499 "Multiple elements found (%s)"
500 % ', '.join([_element_name(e) for e in elements]))
501 el = elements[0]
502 if el.tail and el.tail.strip():
503 raise etree.ParserError(
504 "Element followed by text: %r" % el.tail)
505 el.tail = None
506 return el
507
509 """
510 Parse the html, returning a single element/document.
511
512 This tries to minimally parse the chunk of text, without knowing if it
513 is a fragment or a document.
514 """
515 start = html[:10].lstrip().lower()
516 if start.startswith('<html') or start.startswith('<!doctype'):
517 # Looks like a full HTML document
518 return document_fromstring(html, **kw)
519 # otherwise, lets parse it out...
520 doc = document_fromstring(html, **kw)
521 bodies = doc.findall('body')
522 if bodies:
523 body = bodies[0]
524 if len(bodies) > 1:
525 # Somehow there are multiple bodies, which is bad, but just
526 # smash them into one body
527 for other_body in bodies[1:]:
528 if other_body.text:
529 if len(body):
530 body[-1].tail = (body[-1].tail or '') + other_body.text
531 else:
532 body.text = (body.text or '') + other_body.text
533 body.extend(other_body)
534 # We'll ignore tail
535 # I guess we are ignoring attributes too
536 other_body.drop_tree()
537 else:
538 body = None
539 heads = doc.findall('head')
540 if heads:
541 # Well, we have some sort of structure, so lets keep it all
542 head = heads[0]
543 if len(heads) > 1:
544 for other_head in heads[1:]:
545 head.extend(other_head)
546 # We don't care about text or tail in a head
547 other_head.drop_tree()
548 return doc
549 if (len(body) == 1 and (not body.text or not body.text.strip())
550 and (not body[-1].tail or not body[-1].tail.strip())):
551 # The body has just one element, so it was probably a single
552 # element passed in
553 return body[0]
554 # Now we have a body which represents a bunch of tags which have the
555 # content that was passed in. We will create a fake container, which
556 # is the body tag, except <body> implies too much structure.
557 if _contains_block_level_tag(body):
558 body.tag = 'div'
559 else:
560 body.tag = 'span'
561 return body
562
564 """
565 Parse a filename, URL, or file-like object into an HTML document.
566
567 You may pass the keyword argument ``base_url='http://...'`` to set
568 the base URL.
569 """
570 if parser is None:
571 parser = html_parser
572 return etree.parse(filename, parser, **kw)
573
575 # FIXME: I could do this with XPath, but would that just be
576 # unnecessarily slow?
577 for el in el.getiterator():
578 if el.tag in defs.block_tags:
579 return True
580 return False
581
583 if isinstance(el, etree.CommentBase):
584 return 'comment'
585 elif isinstance(el, basestring):
586 return 'string'
587 else:
588 return el.tag
589
590 ################################################################################
591 # form handling
592 ################################################################################
593
595 """
596 Represents a <form> element.
597 """
598
600 """
601 Returns an accessor for all the input elements in the form.
602
603 See `InputGetter` for more information about the object.
604 """
605 return InputGetter(self)
606 inputs = property(inputs, doc=inputs.__doc__)
607
609 """
610 Dictionary-like object that represents all the fields in this
611 form. You can set values in this dictionary to affect the
612 form.
613 """
614 return FieldsDict(self.inputs)
616 prev_keys = self.fields.keys()
617 for key, value in value.iteritems():
618 if key in prev_keys:
619 prev_keys.remove(key)
620 self.fields[key] = value
621 for key in prev_keys:
622 if key is None:
623 # Case of an unnamed input; these aren't really
624 # expressed in form_values() anyway.
625 continue
626 self.fields[key] = None
627
628 fields = property(fields__get, fields__set, doc=fields__get.__doc__)
629
631 if self.get('name'):
632 return self.get('name')
633 elif self.get('id'):
634 return '#' + self.get('id')
635 return str(self.body.findall('form').index(self))
636
638 """
639 Return a list of tuples of the field values for the form.
640 This is suitable to be passed to ``urllib.urlencode()``.
641 """
642 results = []
643 for el in self.inputs:
644 name = el.name
645 if not name:
646 continue
647 if el.tag == 'textarea':
648 results.append((name, el.value))
649 elif el.tag == 'select':
650 value = el.value
651 if el.multiple:
652 for v in value:
653 results.append((name, v))
654 elif value is not None:
655 results.append((name, el.value))
656 else:
657 assert el.tag == 'input', (
658 "Unexpected tag: %r" % el)
659 if el.checkable and not el.checked:
660 continue
661 if el.type in ('submit', 'image', 'reset'):
662 continue
663 value = el.value
664 if value is not None:
665 results.append((name, el.value))
666 return results
667
669 """
670 Get/set the form's ``action`` attribute.
671 """
672 base_url = self.base_url
673 action = self.get('action')
674 if base_url and action is not None:
675 return urlparse.urljoin(base_url, action)
676 else:
677 return action
683 action = property(action__get, action__set, action__del, doc=action__get.__doc__)
684
686 """
687 Get/set the form's method. Always returns a capitalized
688 string, and defaults to ``'GET'``
689 """
690 return self.get('method', 'GET').upper()
693 method = property(method__get, method__set, doc=method__get.__doc__)
694
695 HtmlElementClassLookup._default_element_classes['form'] = FormElement
696
698 """
699 Helper function to submit a form. Returns a file-like object, as from
700 ``urllib.urlopen()``. This object also has a ``.geturl()`` function,
701 which shows the URL if there were any redirects.
702
703 You can use this like::
704
705 >>> form = doc.forms[0]
706 >>> form.inputs['foo'].value = 'bar' # etc
707 >>> response = form.submit()
708 >>> doc = parse(response)
709 >>> doc.make_links_absolute(response.geturl())
710
711 To change the HTTP requester, pass a function as ``open_http`` keyword
712 argument that opens the URL for you. The function must have the following
713 signature::
714
715 open_http(method, URL, values)
716
717 The action is one of 'GET' or 'POST', the URL is the target URL as a
718 string, and the values are a sequence of ``(name, value)`` tuples with the
719 form data.
720 """
721 values = form.form_values()
722 if extra_values:
723 if hasattr(extra_values, 'items'):
724 extra_values = extra_values.items()
725 values.extend(extra_values)
726 if open_http is None:
727 open_http = open_http_urllib
728 return open_http(form.method, form.action, values)
729
731 import urllib
732 ## FIXME: should test that it's not a relative URL or something
733 if method == 'GET':
734 if '?' in url:
735 url += '&'
736 else:
737 url += '?'
738 url += urllib.urlencode(values)
739 data = None
740 else:
741 data = urllib.urlencode(values)
742 return urllib.urlopen(url, data)
743
764
766
767 """
768 An accessor that represents all the input fields in a form.
769
770 You can get fields by name from this, with
771 ``form.inputs['field_name']``. If there are a set of checkboxes
772 with the same name, they are returned as a list (a `CheckboxGroup`
773 which also allows value setting). Radio inputs are handled
774 similarly.
775
776 You can also iterate over this to get all input elements. This
777 won't return the same thing as if you get all the names, as
778 checkboxes and radio elements are returned individually.
779 """
780
781 _name_xpath = etree.XPath(".//*[@name = $name and (name(.) = 'select' or name(.) = 'input' or name(.) = 'textarea')]")
782 _all_xpath = etree.XPath(".//*[name() = 'select' or name() = 'input' or name() = 'textarea']")
783
786
791
792 ## FIXME: there should be more methods, and it's unclear if this is
793 ## a dictionary-like object or list-like object
794
796 results = self._name_xpath(self.form, name=name)
797 if results:
798 type = results[0].get('type')
799 if type == 'radio' and len(results) > 1:
800 group = RadioGroup(results)
801 group.name = name
802 return group
803 elif type == 'checkbox' and len(results) > 1:
804 group = CheckboxGroup(results)
805 group.name = name
806 return group
807 else:
808 # I don't like throwing away elements like this
809 return results[0]
810 else:
811 raise KeyError(
812 "No input element with the name %r" % name)
813
817
823
825 ## FIXME: kind of dumb to turn a list into an iterator, only
826 ## to have it likely turned back into a list again :(
827 return iter(self._all_xpath(self.form))
828
830
831 """
832 Mix-in for all input elements (input, select, and textarea)
833 """
834
835
846 name = property(name__get, name__set, name__del, doc=name__get.__doc__)
847
856
858 """
859 ``<textarea>`` element. You can get the name with ``.name`` and
860 get/set the value with ``.value``
861 """
862
864 """
865 Get/set the value (which is the contents of this element)
866 """
867 return self.text or ''
871 self.text = ''
872 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
873
874 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
875
877 """
878 ``<select>`` element. You can get the name with ``.name``.
879
880 ``.value`` will be the value of the selected option, unless this
881 is a multi-select element (``<select multiple>``), in which case
882 it will be a set-like object. In either case ``.value_options``
883 gives the possible values.
884
885 The boolean attribute ``.multiple`` shows if this is a
886 multi-select.
887 """
888
890 """
891 Get/set the value of this select (the selected option).
892
893 If this is a multi-select, this is a set-like object that
894 represents all the selected options.
895 """
896 if self.multiple:
897 return MultipleSelectOptions(self)
898 for el in self.getiterator('option'):
899 if 'selected' in el.attrib:
900 value = el.get('value')
901 # FIXME: If value is None, what to return?, get_text()?
902 return value
903 return None
904
906 if self.multiple:
907 if isinstance(value, basestring):
908 raise TypeError(
909 "You must pass in a sequence")
910 self.value.clear()
911 self.value.update(value)
912 return
913 if value is not None:
914 for el in self.getiterator('option'):
915 # FIXME: also if el.get('value') is None?
916 if el.get('value') == value:
917 checked_option = el
918 break
919 else:
920 raise ValueError(
921 "There is no option with the value of %r" % value)
922 for el in self.getiterator('option'):
923 if 'selected' in el.attrib:
924 del el.attrib['selected']
925 if value is not None:
926 checked_option.set('selected', '')
927
929 # FIXME: should del be allowed at all?
930 if self.multiple:
931 self.value.clear()
932 else:
933 self.value = None
934
935 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
936
938 """
939 All the possible values this select can have (the ``value``
940 attribute of all the ``<option>`` elements).
941 """
942 return [el.get('value') for el in self.getiterator('option')]
943 value_options = property(value_options, doc=value_options.__doc__)
944
946 """
947 Boolean attribute: is there a ``multiple`` attribute on this element.
948 """
949 return 'multiple' in self.attrib
951 if value:
952 self.set('multiple', '')
953 elif 'multiple' in self.attrib:
954 del self.attrib['multiple']
955 multiple = property(multiple__get, multiple__set, doc=multiple__get.__doc__)
956
957 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
958
960 """
961 Represents all the selected options in a ``<select multiple>`` element.
962
963 You can add to this set-like option to select an option, or remove
964 to unselect the option.
965 """
966
969
971 """
972 Iterator of all the ``<option>`` elements.
973 """
974 return self.select.getiterator('option')
975 options = property(options)
976
980
982 for option in self.options:
983 if option.get('value') == item:
984 option.set('selected', '')
985 break
986 else:
987 raise ValueError(
988 "There is no option with the value %r" % item)
989
991 for option in self.options:
992 if option.get('value') == item:
993 if 'selected' in option.attrib:
994 del option.attrib['selected']
995 else:
996 raise ValueError(
997 "The option %r is not currently selected" % item)
998 break
999 else:
1000 raise ValueError(
1001 "There is not option with the value %r" % item)
1002
1008
1010 """
1011 This object represents several ``<input type=radio>`` elements
1012 that have the same name.
1013
1014 You can use this like a list, but also use the property
1015 ``.value`` to check/uncheck inputs. Also you can use
1016 ``.value_options`` to get the possible values.
1017 """
1018
1020 """
1021 Get/set the value, which checks the radio with that value (and
1022 unchecks any other value).
1023 """
1024 for el in self:
1025 if 'checked' in el.attrib:
1026 return el.get('value')
1027 return None
1028
1030 if value is not None:
1031 for el in self:
1032 if el.get('value') == value:
1033 checked_option = el
1034 break
1035 else:
1036 raise ValueError(
1037 "There is no radio input with the value %r" % value)
1038 for el in self:
1039 if 'checked' in el.attrib:
1040 del el.attrib['checked']
1041 if value is not None:
1042 checked_option.set('checked', '')
1043
1045 self.value = None
1046
1047 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
1048
1050 """
1051 Returns a list of all the possible values.
1052 """
1053 return [el.get('value') for el in self]
1054 value_options = property(value_options, doc=value_options.__doc__)
1055
1060
1062 """
1063 Represents a group of checkboxes (``<input type=checkbox>``) that
1064 have the same name.
1065
1066 In addition to using this like a list, the ``.value`` attribute
1067 returns a set-like object that you can add to or remove from to
1068 check and uncheck checkboxes. You can also use ``.value_options``
1069 to get the possible values.
1070 """
1071
1073 """
1074 Return a set-like object that can be modified to check or
1075 uncheck individual checkboxes according to their value.
1076 """
1077 return CheckboxValues(self)
1079 self.value.clear()
1080 if not hasattr(value, '__iter__'):
1081 raise ValueError(
1082 "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
1083 % (self[0].name, value))
1084 self.value.update(value)
1087 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
1088
1092
1094
1095 """
1096 Represents the values of the checked checkboxes in a group of
1097 checkboxes with the same name.
1098 """
1099
1102
1108
1110 for el in self.group:
1111 if el.get('value') == value:
1112 el.set('checked', '')
1113 break
1114 else:
1115 raise KeyError("No checkbox with value %r" % value)
1116
1118 for el in self.group:
1119 if el.get('value') == value:
1120 if 'checked' in el.attrib:
1121 del el.attrib['checked']
1122 else:
1123 raise KeyError(
1124 "The checkbox with value %r was already unchecked" % value)
1125 break
1126 else:
1127 raise KeyError(
1128 "No checkbox with value %r" % value)
1129
1135
1137 """
1138 Represents an ``<input>`` element.
1139
1140 You can get the type with ``.type`` (which is lower-cased and
1141 defaults to ``'text'``).
1142
1143 Also you can get and set the value with ``.value``
1144
1145 Checkboxes and radios have the attribute ``input.checkable ==
1146 True`` (for all others it is false) and a boolean attribute
1147 ``.checked``.
1148
1149 """
1150
1151 ## FIXME: I'm a little uncomfortable with the use of .checked
1153 """
1154 Get/set the value of this element, using the ``value`` attribute.
1155
1156 Also, if this is a checkbox and it has no value, this defaults
1157 to ``'on'``. If it is a checkbox or radio that is not
1158 checked, this returns None.
1159 """
1160 if self.checkable:
1161 if self.checked:
1162 return self.get('value') or 'on'
1163 else:
1164 return None
1165 return self.get('value')
1167 if self.checkable:
1168 if not value:
1169 self.checked = False
1170 else:
1171 self.checked = True
1172 if isinstance(value, basestring):
1173 self.set('value', value)
1174 else:
1175 self.set('value', value)
1177 if self.checkable:
1178 self.checked = False
1179 else:
1180 if 'value' in self.attrib:
1181 del self.attrib['value']
1182 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
1183
1185 """
1186 Return the type of this element (using the type attribute).
1187 """
1188 return self.get('type', 'text').lower()
1191 type = property(type__get, type__set, doc=type__get.__doc__)
1192
1194 """
1195 Boolean: can this element be checked?
1196 """
1197 return self.type in ['checkbox', 'radio']
1198 checkable = property(checkable__get, doc=checkable__get.__doc__)
1199
1201 """
1202 Boolean attribute to get/set the presence of the ``checked``
1203 attribute.
1204
1205 You can only use this on checkable input types.
1206 """
1207 if not self.checkable:
1208 raise AttributeError('Not a checkable input type')
1209 return 'checked' in self.attrib
1211 if not self.checkable:
1212 raise AttributeError('Not a checkable input type')
1213 if value:
1214 self.set('checked', '')
1215 else:
1216 if 'checked' in self.attrib:
1217 del self.attrib['checked']
1218 checked = property(checked__get, checked__set, doc=checked__get.__doc__)
1219
1220 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1221
1223 """
1224 Represents a ``<label>`` element.
1225
1226 Label elements are linked to other elements with their ``for``
1227 attribute. You can access this element with ``label.for_element``.
1228 """
1229
1231 """
1232 Get/set the element this label points to. Return None if it
1233 can't be found.
1234 """
1235 id = self.get('for')
1236 if not id:
1237 return None
1238 return self.body.get_element_by_id(id)
1240 id = other.get('id')
1241 if not id:
1242 raise TypeError(
1243 "Element %r has no id attribute" % other)
1244 self.set('for', id)
1248 for_element = property(for_element__get, for_element__set, for_element__del,
1249 doc=for_element__get.__doc__)
1250
1251 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1252
1253 ############################################################
1254 ## Serialization
1255 ############################################################
1256
1257 # This isn't a general match, but it's a match for what libxml2
1258 # specifically serialises:
1259 __replace_meta_content_type = re.compile(
1260 r'<meta http-equiv="Content-Type".*?>').sub
1261
1263 """
1264 Return the HTML string representation of the given document.
1265
1266 Note: this will create a meta http-equiv="Content-Type" tag in the head (and may replace any
1267 that are present); that tag is removed from the result unless include_meta_content_type is true.
1268 """
1269 assert doc is not None
1270 html = etree.tostring(doc, method="html", pretty_print=pretty_print)
1271 if not include_meta_content_type:
1272 html = __replace_meta_content_type('', html) # strip every tag matched by the module-level Content-Type regex
1273 return html
1274
1276 """
1277 Open the HTML document in a web browser (saving it to a temporary
1278 file to open it).
1279 """
1280 import os
1281 import webbrowser
1282 try:
1283 write_doc = doc.write # use the object's own write() method when it has one
1284 except AttributeError:
1285 write_doc = etree.ElementTree(element=doc).write # otherwise wrap the element in an ElementTree to get one
1286 fn = os.tempnam() + '.html' # NOTE(review): tempnam() only returns a name and the Python docs warn it is open to symlink attacks -- consider tempfile.mkstemp
1287 write_doc(fn, method="html")
1288 url = 'file://' + fn.replace(os.path.sep, '/') # use forward slashes in the URL whatever the platform separator is
1289 print url
1290 webbrowser.open(url)
1291
1292 ################################################################################
1293 # configure Element class lookup
1294 ################################################################################
1295
1298 super(HTMLParser, self).__init__(**kwargs)
1299 self.setElementClassLookup(HtmlElementClassLookup())
1300
1304
1305 html_parser = HTMLParser()
1306
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0beta1 on Sun Nov 25 11:48:45 2007 | http://epydoc.sourceforge.net |