| |
-
-- builtins.Exception(builtins.BaseException)
-
-
-
-- AdvancedHTMLParser.exceptions.HTMLValidationException
-
-
-
-- AdvancedHTMLParser.exceptions.InvalidCloseException
-
- AdvancedHTMLParser.exceptions.MissedCloseException
-
-
-- AdvancedHTMLParser.exceptions.MultipleRootNodeException
-
-
-- builtins.list(builtins.object)
-
-
-
-- AdvancedHTMLParser.Tags.TagCollection
-
-
-- builtins.object
-
-
-
-- AdvancedHTMLParser.SpecialAttributes.StyleAttribute
-
- AdvancedHTMLParser.Tags.AdvancedTag
-
-
-- html.parser.HTMLParser(_markupbase.ParserBase)
-
-
-
-- AdvancedHTMLParser.Formatter.AdvancedHTMLFormatter
-
-
-
-- AdvancedHTMLParser.Formatter.AdvancedHTMLMiniFormatter
-
-
-
-- AdvancedHTMLParser.Formatter.AdvancedHTMLSlimTagMiniFormatter
-
-
-- AdvancedHTMLParser.Formatter.AdvancedHTMLSlimTagFormatter
-
-
-- AdvancedHTMLParser.Parser.AdvancedHTMLParser
-
-
-
-- AdvancedHTMLParser.Parser.IndexedAdvancedHTMLParser
-
- AdvancedHTMLParser.Validator.ValidatingAdvancedHTMLParser
-
-
-
-
-
-
-
-
-
-class AdvancedHTMLFormatter(html.parser.HTMLParser) |
-
- |
-AdvancedHTMLFormatter(indent=' ', encoding='utf-8')
-
-A formatter for HTML. Note this does not understand CSS, so if you are enabling preformatted text based on css rules, it will not work.
-It does, however, understand "pre", "code" and "script" tags and will not try to format their contents. |
- |
-- Method resolution order:
-- AdvancedHTMLFormatter
-- html.parser.HTMLParser
-- _markupbase.ParserBase
-- builtins.object
-
-
-Methods defined here:
-- __init__(self, indent=' ', encoding='utf-8')
- Create a pretty formatter.
-
-@param indent <str/int>, Default ' ' [4 spaces] - Either a space/tab/newline that represents one level of indent, or an integer to use that number of spaces
-
-@param encoding <str/None>, Default 'utf-8', - Use this encoding for the document. None to not mess with encoding
-
-- feed(self, contents)
- feed - Load contents
-
-@param contents - HTML contents
-
-- getHTML(self)
- getHTML - Get the full HTML as contained within this tree, converted to valid XHTML
- @returns - String
-
-- getRoot(self)
- getRoot - returns the root Tag
- @return - AdvancedTag at root. If you provided multiple root nodes, this will be a "holder" with tagName value as constants.INVISIBLE_ROOT_TAG
-
-- getRootNodes(self)
- getRootNodes - Gets all objects at the "root" (first level; no parent). Use this if you may have multiple roots (not children of <html>)
- Use this method to get objects, for example, in an AJAX request where <html> may not be your root.
-
- Note: If there are multiple root nodes (i.e. no <html> at the top), getRoot will return a special tag. This function automatically
- handles that, and returns all root nodes.
-
- @return list<AdvancedTag> - A list of AdvancedTags which are at the root level of the tree.
-
-- handle_charref(self, charRef)
- Internal for parsing
-
-- handle_comment(self, comment)
- Internal for parsing
-
-- handle_data(self, data)
- handle_data - Internal for parsing
-
-- handle_decl(self, decl)
- Internal for parsing
-
-- handle_endtag(self, tagName)
- handle_endtag - Internal for parsing
-
-- handle_entityref(self, entity)
- Internal for parsing
-
-- handle_startendtag(self, tagName, attributeList)
- handle_startendtag - Internal for parsing
-
-- handle_starttag(self, tagName, attributeList, isSelfClosing=False)
- handle_starttag - Internal for parsing
-
-- parseFile(self, filename)
- parseFile - Parses a file and creates the DOM tree and indexes
-
- @param filename <str/file> - A string to a filename or a file object. If file object, it will not be closed, you must close.
-
-- parseStr(self, html)
- parseStr - Parses a string and creates the DOM tree and indexes.
-
- @param html <str> - valid HTML
-
-- setRoot(self, root)
- setRoot - Sets the root node, and reprocesses the indexes
-
-@param root - AdvancedTag to be new root
-
-- unknown_decl(self, decl)
- Internal for parsing
-
-
-Methods inherited from html.parser.HTMLParser:
-- check_for_whole_start_tag(self, i)
- # Internal -- check to see if we have a complete starttag; return end
-# or -1 if incomplete.
-
-- clear_cdata_mode(self)
-
-- close(self)
- Handle any buffered data.
-
-- get_starttag_text(self)
- Return full source of start tag: '<...>'.
-
-- goahead(self, end)
- # Internal -- handle data as far as reasonable. May leave state
-# and data to be processed by a subsequent call. If 'end' is
-# true, force handling all data as if followed by EOF marker.
-
-- handle_pi(self, data)
- # Overridable -- handle processing instruction
-
-- parse_bogus_comment(self, i, report=1)
- # Internal -- parse bogus comment, return length or -1 if not terminated
-# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
-
-- parse_endtag(self, i)
- # Internal -- parse endtag, return end or -1 if incomplete
-
-- parse_html_declaration(self, i)
- # Internal -- parse html declarations, return length or -1 if not terminated
-# See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
-# See also parse_declaration in _markupbase
-
-- parse_pi(self, i)
- # Internal -- parse processing instr, return end or -1 if not terminated
-
-- parse_starttag(self, i)
- # Internal -- handle starttag, return end or -1 if not terminated
-
-- reset(self)
- Reset this instance. Loses all unprocessed data.
-
-- set_cdata_mode(self, elem)
-
-- unescape(self, s)
- # Internal -- helper to remove special character quoting
-
-
-Data and other attributes inherited from html.parser.HTMLParser:
-- CDATA_CONTENT_ELEMENTS = ('script', 'style')
-
-
-Methods inherited from _markupbase.ParserBase:
-- error(self, message)
-
-- getpos(self)
- Return current line number and offset.
-
-- parse_comment(self, i, report=1)
- # Internal -- parse comment, return length or -1 if not terminated
-
-- parse_declaration(self, i)
- # Internal -- parse declaration (for use by subclasses).
-
-- parse_marked_section(self, i, report=1)
- # Internal -- parse a marked section
-# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
-
-- updatepos(self, i, j)
- # Internal -- update line number and offset. This should be
-# called for each piece of data exactly once, in order -- in other
-# words the concatenation of all the input strings to this
-# function should be exactly the entire input.
-
-
-Data descriptors inherited from _markupbase.ParserBase:
-- __dict__
-- dictionary for instance variables (if defined)
-
-- __weakref__
-- list of weak references to the object (if defined)
-
- |
-
-
-
-class AdvancedHTMLMiniFormatter(AdvancedHTMLFormatter) |
-
- |
-AdvancedHTMLMiniFormatter(encoding='utf-8')
-
-AdvancedHTMLMiniFormatter - A formatter that will reformat a document, keeping only functional
- whitespace and removing any and all indentation and nesting spaces. |
- |
-- Method resolution order:
-- AdvancedHTMLMiniFormatter
-- AdvancedHTMLFormatter
-- html.parser.HTMLParser
-- _markupbase.ParserBase
-- builtins.object
-
-
-Methods defined here:
-- __init__(self, encoding='utf-8')
- Create a mini formatter.
-
-@param encoding <str/None>, Default 'utf-8', - Use this encoding for the document. None to not mess with encoding
-
-
-Methods inherited from AdvancedHTMLFormatter:
-- feed(self, contents)
- feed - Load contents
-
-@param contents - HTML contents
-
-- getHTML(self)
- getHTML - Get the full HTML as contained within this tree, converted to valid XHTML
- @returns - String
-
-- getRoot(self)
- getRoot - returns the root Tag
- @return - AdvancedTag at root. If you provided multiple root nodes, this will be a "holder" with tagName value as constants.INVISIBLE_ROOT_TAG
-
-- getRootNodes(self)
- getRootNodes - Gets all objects at the "root" (first level; no parent). Use this if you may have multiple roots (not children of <html>)
- Use this method to get objects, for example, in an AJAX request where <html> may not be your root.
-
- Note: If there are multiple root nodes (i.e. no <html> at the top), getRoot will return a special tag. This function automatically
- handles that, and returns all root nodes.
-
- @return list<AdvancedTag> - A list of AdvancedTags which are at the root level of the tree.
-
-- handle_charref(self, charRef)
- Internal for parsing
-
-- handle_comment(self, comment)
- Internal for parsing
-
-- handle_data(self, data)
- handle_data - Internal for parsing
-
-- handle_decl(self, decl)
- Internal for parsing
-
-- handle_endtag(self, tagName)
- handle_endtag - Internal for parsing
-
-- handle_entityref(self, entity)
- Internal for parsing
-
-- handle_startendtag(self, tagName, attributeList)
- handle_startendtag - Internal for parsing
-
-- handle_starttag(self, tagName, attributeList, isSelfClosing=False)
- handle_starttag - Internal for parsing
-
-- parseFile(self, filename)
- parseFile - Parses a file and creates the DOM tree and indexes
-
- @param filename <str/file> - A string to a filename or a file object. If file object, it will not be closed, you must close.
-
-- parseStr(self, html)
- parseStr - Parses a string and creates the DOM tree and indexes.
-
- @param html <str> - valid HTML
-
-- setRoot(self, root)
- setRoot - Sets the root node, and reprocesses the indexes
-
-@param root - AdvancedTag to be new root
-
-- unknown_decl(self, decl)
- Internal for parsing
-
-
-Methods inherited from html.parser.HTMLParser:
-- check_for_whole_start_tag(self, i)
- # Internal -- check to see if we have a complete starttag; return end
-# or -1 if incomplete.
-
-- clear_cdata_mode(self)
-
-- close(self)
- Handle any buffered data.
-
-- get_starttag_text(self)
- Return full source of start tag: '<...>'.
-
-- goahead(self, end)
- # Internal -- handle data as far as reasonable. May leave state
-# and data to be processed by a subsequent call. If 'end' is
-# true, force handling all data as if followed by EOF marker.
-
-- handle_pi(self, data)
- # Overridable -- handle processing instruction
-
-- parse_bogus_comment(self, i, report=1)
- # Internal -- parse bogus comment, return length or -1 if not terminated
-# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
-
-- parse_endtag(self, i)
- # Internal -- parse endtag, return end or -1 if incomplete
-
-- parse_html_declaration(self, i)
- # Internal -- parse html declarations, return length or -1 if not terminated
-# See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
-# See also parse_declaration in _markupbase
-
-- parse_pi(self, i)
- # Internal -- parse processing instr, return end or -1 if not terminated
-
-- parse_starttag(self, i)
- # Internal -- handle starttag, return end or -1 if not terminated
-
-- reset(self)
- Reset this instance. Loses all unprocessed data.
-
-- set_cdata_mode(self, elem)
-
-- unescape(self, s)
- # Internal -- helper to remove special character quoting
-
-
-Data and other attributes inherited from html.parser.HTMLParser:
-- CDATA_CONTENT_ELEMENTS = ('script', 'style')
-
-
-Methods inherited from _markupbase.ParserBase:
-- error(self, message)
-
-- getpos(self)
- Return current line number and offset.
-
-- parse_comment(self, i, report=1)
- # Internal -- parse comment, return length or -1 if not terminated
-
-- parse_declaration(self, i)
- # Internal -- parse declaration (for use by subclasses).
-
-- parse_marked_section(self, i, report=1)
- # Internal -- parse a marked section
-# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
-
-- updatepos(self, i, j)
- # Internal -- update line number and offset. This should be
-# called for each piece of data exactly once, in order -- in other
-# words the concatenation of all the input strings to this
-# function should be exactly the entire input.
-
-
-Data descriptors inherited from _markupbase.ParserBase:
-- __dict__
-- dictionary for instance variables (if defined)
-
-- __weakref__
-- list of weak references to the object (if defined)
-
- |
-
-
-
-class AdvancedHTMLParser(html.parser.HTMLParser) |
-
- |
-AdvancedHTMLParser(filename=None, encoding='utf-8')
-
-AdvancedHTMLParser - This class parses and allows searching of documents |
- |
-- Method resolution order:
-- AdvancedHTMLParser
-- html.parser.HTMLParser
-- _markupbase.ParserBase
-- builtins.object
-
-
-Methods defined here:
-- __contains__(self, other)
-
-- __getstate__(self)
- __getstate__ - Get state for pickling
-
- @return <dict>
-
-- __init__(self, filename=None, encoding='utf-8')
- __init__ - Creates an Advanced HTML parser object. For read-only parsing, consider IndexedAdvancedHTMLParser for faster searching.
-
- @param filename <str> - Optional filename to parse. Otherwise use parseFile or parseStr methods.
- @param encoding <str> - Specifies the document encoding. Default utf-8
-
-- __setstate__(self, state)
- __setstate__ - Restore state for loading pickle
-
- @param state <dict> - The state
-
-- asHTML = getHTML(self)
-
-- contains(self, em)
- Checks if #em is found anywhere within this element tree
-
-@param em <AdvancedTag> - Tag of interest
-
-@return <bool> - If element #em is within this tree
-
-- containsUid(self, uid)
- Check if #uid is found anywhere within this element tree
-
-@param uid <uuid.UUID> - Uid
-
-@return <bool> - If #uid is found within this tree
-
-- createElement(self, tagName)
- createElement - Create an unattached tag with the given tag name
-
-@param tagName <str> - Name of tag
-
-@return <AdvancedTag> - A tag with the given tag name
-
-- feed(self, contents)
- feed - Feed contents. Use parseStr or parseFile instead.
-
-@param contents - Contents
-
-- filter(self, **kwargs)
- filter aka filterAnd - Filter ALL the elements in this DOM.
-
-Results must match ALL the filter criteria. For ANY, use the *Or methods
-
-Requires the QueryableList module to be installed (i.e. AdvancedHTMLParser was installed
- without '--no-deps' flag.)
-
-For alternative without QueryableList,
- consider #AdvancedHTMLParser.AdvancedHTMLParser.find method or the getElement* methods
-
-Special Keys:
-
- tagname - The tag name
- text - The inner text
-
-@return TagCollection<AdvancedTag>
-
-- filterAnd = filter(self, **kwargs)
-
-- filterOr(self, **kwargs)
- filterOr - Perform a filter operation on this node and all children (and their children, onto the end)
-
-Results must match ANY of the filter criteria. For ALL, use the *And methods
-
-For special filter keys, @see #AdvancedHTMLParser.AdvancedHTMLParser.filter
-
-Requires the QueryableList module to be installed (i.e. AdvancedHTMLParser was installed
- without '--no-deps' flag.)
-
-For alternative, consider AdvancedHTMLParser.AdvancedHTMLParser.find method or the getElement* methods
-
-@return TagCollection<AdvancedTag>
-
-- find(self, **kwargs)
- find - Perform a search of elements using attributes as keys and potential values as values
-
- (i.e. parser.find(name='blah', tagname='span') will return all elements in this document
- with the name "blah" of the tag type "span" )
-
-Arguments are key = value, or key can equal a tuple/list of values to match ANY of those values.
-
-Append a key with __contains to test if some strs (or several possible strs) are within an element
-Append a key with __icontains to perform the same __contains op, but ignoring case
-
-Special keys:
-
- tagname - The tag name of the element
- text - The text within an element
-
-NOTE: Empty string means both "not set" and "no value" in this implementation.
-
-NOTE: If you installed the QueryableList module (i.e. ran setup.py without --no-deps) it is
- better to use the "filter"/"filterAnd" or "filterOr" methods, which are also available
- on all tags and tag collections (tag collections also have filterAllAnd and filterAllOr)
-
-
-@return TagCollection<AdvancedTag> - A list of tags that matched the filter criteria
-
-- getAllNodes(self)
- getAllNodes - Get every element
-
-@return TagCollection<AdvancedTag>
-
-- getElementById(self, _id, root='root')
- getElementById - Searches and returns the first (should only be one) element with the given ID.
-
- @param _id <str> - A string of the id attribute.
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root' [default], the root of the parsed tree will be used.
-
-- getElementsByAttr(self, attrName, attrValue, root='root')
- getElementsByAttr - Searches the full tree for elements with a given attribute name and value combination. This is always a full scan.
-
- @param attrName <lowercase str> - A lowercase attribute name
- @param attrValue <str> - Expected value of attribute
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root', the root of the parsed tree will be used.
-
-- getElementsByClassName(self, className, root='root')
- getElementsByClassName - Searches and returns all elements containing a given class name.
-
- @param className <str> - One or more space-separated class names
-
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root' [default], the root of the parsed tree will be used.
-
-- getElementsByName(self, name, root='root')
- getElementsByName - Searches and returns all elements with a specific name.
-
- @param name <str> - A string of the name attribute
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root' [default], the root of the parsed tree will be used.
-
-- getElementsByTagName(self, tagName, root='root')
- getElementsByTagName - Searches and returns all elements with a specific tag name.
-
- @param tagName <lowercase str> - A lowercase string of the tag name.
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root', the root of the parsed tree will be used.
-
-- getElementsCustomFilter(self, filterFunc, root='root')
- getElementsCustomFilter - Scan elements using a provided function
-
-@param filterFunc <function>(node) - A function that takes an AdvancedTag as an argument, and returns True if some arbitrary criteria is met
-
-@return - TagCollection of all matching elements
-
-- getElementsWithAttrValues(self, attrName, attrValues, root='root')
- getElementsWithAttrValues - Returns elements with an attribute, named by #attrName, that contains one of the values in #attrValues
-
-@param attrName <lowercase str> - A lowercase attribute name
-@param attrValues set<str> - A set of all valid values.
-
-
-@return - TagCollection of all matching elements
-
-- getFirstElementCustomFilter(self, filterFunc, root='root')
- getFirstElementCustomFilter - Scan elements using a provided function, stop and return the first match.
-
- @see getElementsCustomFilter to match multiple elements
-
-@param filterFunc <function>(node) - A function that takes an AdvancedTag as an argument, and returns True if some arbitrary criteria is met
-
-@return - An AdvancedTag of the node that matched, or None if no match.
-
-- getFormattedHTML(self, indent=' ')
- getFormattedHTML - Get the formatted XHTML of this document, replacing the original whitespace
- with a pretty-printed version
-
-@param indent - space/tab/newline of each level of indent, or integer for how many spaces per level
-
-@return - <str> Formatted html
-
-@see getHTML - Get HTML with original whitespace
-
-@see getMiniHTML - Get HTML with only functional whitespace remaining
-
-- getHTML(self)
- getHTML - Get the full HTML as contained within this tree.
-
- If parsed from a document, this will contain the original whitespacing.
-
- @returns - <str> of html
-
- @see getFormattedHTML
-
- @see getMiniHTML
-
-- getMiniHTML(self)
- getMiniHTML - Gets the HTML representation of this document without any pretty formatting
- and disregarding original whitespace beyond the functional.
-
- @return <str> - HTML with only functional whitespace present
-
-- getRoot(self)
- getRoot - returns the root Tag.
-
- NOTE: if there are multiple roots, this will be a special tag.
- You may want to consider using getRootNodes instead if this
- is a possible situation for you.
-
-@return AdvancedTag
-
-- getRootNodes(self)
- getRootNodes - Gets all objects at the "root" (first level; no parent). Use this if you may have multiple roots (not children of <html>)
- Use this method to get objects, for example, in an AJAX request where <html> may not be your root.
-
- Note: If there are multiple root nodes (i.e. no <html> at the top), getRoot will return a special tag. This function automatically
- handles that, and returns all root nodes.
-
- @return list<AdvancedTag> - A list of AdvancedTags which are at the root level of the tree.
-
-- handle_charref(self, charRef)
- Internal for parsing
-
-- handle_comment(self, comment)
- Internal for parsing
-
-- handle_data(self, data)
- Internal for parsing
-
-- handle_decl(self, decl)
- Internal for parsing
-
-- handle_endtag(self, tagName)
- Internal for parsing
-
-- handle_entityref(self, entity)
- Internal for parsing
-
-- handle_startendtag(self, tagName, attributeList)
- Internal for parsing
-
-- handle_starttag(self, tagName, attributeList, isSelfClosing=False)
- Internal for parsing
-
-- parseFile(self, filename)
- parseFile - Parses a file and creates the DOM tree and indexes
-
- @param filename <str/file> - A string to a filename or a file object. If file object, it will not be closed, you must close.
-
-- parseStr(self, html)
- parseStr - Parses a string and creates the DOM tree and indexes.
-
- @param html <str> - valid HTML
-
-- setDoctype(self, newDoctype)
- setDoctype - Set the doctype for this document, or clear it.
-
- @param newDoctype <str/None> -
-
- If None, will clear the doctype and not return one with #getHTML
-
- Otherwise, a string of the full doctype tag.
-
- For example, the HTML5 doctype would be "DOCTYPE html"
-
-- setRoot(self, root)
- Sets the root node, and reprocesses the indexes
-
-- toHTML = getHTML(self)
-
-- unknown_decl(self, decl)
- Internal for parsing
-
-
-Class methods defined here:
-- createBlocksFromHTML(html, encoding='utf-8') from builtins.type
- createBlocksFromHTML - Returns the root level node (unless multiple nodes), and
- a list of "blocks" added (text and nodes).
-
-@return list< str/AdvancedTag > - List of blocks created. May be strings (text nodes) or AdvancedTag (tags)
-
-NOTE:
- Results may be checked by:
-
- issubclass(block.__class__, AdvancedTag)
-
- If True, block is a tag, otherwise, it is a text node
-
-- createElementFromHTML(html, encoding='utf-8') from builtins.type
- createElementFromHTML - Creates an element from a string of HTML.
-
- If this could create multiple root-level elements (children are okay),
- you must use #createElementsFromHTML which returns a list of elements created.
-
-@param html <str> - Some html data
-
-@param encoding <str> - Encoding to use for document
-
-@raises MultipleRootNodeException - If given html would produce multiple root-level elements (use #createElementsFromHTML instead)
-
-@return AdvancedTag - A single AdvancedTag
-
-NOTE: If there is text outside the tag, it will be lost in this.
- Use createBlocksFromHTML instead if you need to retain both text and tags.
-
- Also, if you are just appending to an existing tag, use AdvancedTag.appendInnerHTML
-
-- createElementsFromHTML(html, encoding='utf-8') from builtins.type
- createElementsFromHTML - Creates elements from provided html, and returns a list of the root-level elements.
- Children of these root-level nodes are accessible via the usual means.
-
-@param html <str> - Some html data
-
-@param encoding <str> - Encoding to use for document
-
-@return list<AdvancedTag> - The root (top-level) tags from parsed html.
-
-NOTE: If there is text outside the tags, it will be lost in this.
- Use createBlocksFromHTML instead if you need to retain both text and tags.
-
- Also, if you are just appending to an existing tag, use AdvancedTag.appendInnerHTML
-
-
-Data descriptors defined here:
-- body
-- body - Get the body element
-
-@return <AdvancedTag> - The body tag, or None if no body tag present
-
-- forms
-- forms - Return all forms associated with this document
-
-@return <TagCollection> - All "form" elements
-
-- head
-- head - Get the head element
-
-@return <AdvancedTag> - The head tag, or None if no head tag present
-
-
-Methods inherited from html.parser.HTMLParser:
-- check_for_whole_start_tag(self, i)
- # Internal -- check to see if we have a complete starttag; return end
-# or -1 if incomplete.
-
-- clear_cdata_mode(self)
-
-- close(self)
- Handle any buffered data.
-
-- get_starttag_text(self)
- Return full source of start tag: '<...>'.
-
-- goahead(self, end)
- # Internal -- handle data as far as reasonable. May leave state
-# and data to be processed by a subsequent call. If 'end' is
-# true, force handling all data as if followed by EOF marker.
-
-- handle_pi(self, data)
- # Overridable -- handle processing instruction
-
-- parse_bogus_comment(self, i, report=1)
- # Internal -- parse bogus comment, return length or -1 if not terminated
-# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
-
-- parse_endtag(self, i)
- # Internal -- parse endtag, return end or -1 if incomplete
-
-- parse_html_declaration(self, i)
- # Internal -- parse html declarations, return length or -1 if not terminated
-# See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
-# See also parse_declaration in _markupbase
-
-- parse_pi(self, i)
- # Internal -- parse processing instr, return end or -1 if not terminated
-
-- parse_starttag(self, i)
- # Internal -- handle starttag, return end or -1 if not terminated
-
-- reset(self)
- Reset this instance. Loses all unprocessed data.
-
-- set_cdata_mode(self, elem)
-
-- unescape(self, s)
- # Internal -- helper to remove special character quoting
-
-
-Data and other attributes inherited from html.parser.HTMLParser:
-- CDATA_CONTENT_ELEMENTS = ('script', 'style')
-
-
-Methods inherited from _markupbase.ParserBase:
-- error(self, message)
-
-- getpos(self)
- Return current line number and offset.
-
-- parse_comment(self, i, report=1)
- # Internal -- parse comment, return length or -1 if not terminated
-
-- parse_declaration(self, i)
- # Internal -- parse declaration (for use by subclasses).
-
-- parse_marked_section(self, i, report=1)
- # Internal -- parse a marked section
-# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
-
-- updatepos(self, i, j)
- # Internal -- update line number and offset. This should be
-# called for each piece of data exactly once, in order -- in other
-# words the concatenation of all the input strings to this
-# function should be exactly the entire input.
-
-
-Data descriptors inherited from _markupbase.ParserBase:
-- __dict__
-- dictionary for instance variables (if defined)
-
-- __weakref__
-- list of weak references to the object (if defined)
-
- |
-
-
-
-class AdvancedHTMLSlimTagFormatter(AdvancedHTMLFormatter) |
-
- |
-AdvancedHTMLSlimTagFormatter(indent=' ', encoding='utf-8', slimSelfClosing=False)
-
-AdvancedHTMLSlimTagFormatter - Formats HTML with slim start tags,
- which may break some xhtml-compatible parsers.
-
-For example <span id="abc" > will become <span id="abc">.
-
-Remainder will be pretty-printed. For mini-printing, @see AdvancedHTMLSlimTagMiniFormatter
-
-If slimSelfClosing=True on __init__, <br /> will become <br/> as well |
- |
-- Method resolution order:
-- AdvancedHTMLSlimTagFormatter
-- AdvancedHTMLFormatter
-- html.parser.HTMLParser
-- _markupbase.ParserBase
-- builtins.object
-
-
-Methods defined here:
-- __init__(self, indent=' ', encoding='utf-8', slimSelfClosing=False)
- __init__ - Construct an AdvancedHTMLSlimTagFormatter
-
- @see AdvancedHTMLFormatter
-
- @param slimSelfClosing <bool> Default False - If True, will use slim self-closing tags,
-
- e.g. <br /> becomes <br/>
-
-- handle_starttag = handle_starttag_slim(self, tagName, attributeList, isSelfClosing=False)
- handle_starttag_slim - Handles parsing a start tag, but with "slim" start tags
-
- @see AdvancedHTMLFormatter.handle_starttag
-
-
-Methods inherited from AdvancedHTMLFormatter:
-- feed(self, contents)
- feed - Load contents
-
-@param contents - HTML contents
-
-- getHTML(self)
- getHTML - Get the full HTML as contained within this tree, converted to valid XHTML
- @returns - String
-
-- getRoot(self)
- getRoot - returns the root Tag
- @return - AdvancedTag at root. If you provided multiple root nodes, this will be a "holder" with tagName value as constants.INVISIBLE_ROOT_TAG
-
-- getRootNodes(self)
- getRootNodes - Gets all objects at the "root" (first level; no parent). Use this if you may have multiple roots (not children of <html>)
- Use this method to get objects, for example, in an AJAX request where <html> may not be your root.
-
- Note: If there are multiple root nodes (i.e. no <html> at the top), getRoot will return a special tag. This function automatically
- handles that, and returns all root nodes.
-
- @return list<AdvancedTag> - A list of AdvancedTags which are at the root level of the tree.
-
-- handle_charref(self, charRef)
- Internal for parsing
-
-- handle_comment(self, comment)
- Internal for parsing
-
-- handle_data(self, data)
- handle_data - Internal for parsing
-
-- handle_decl(self, decl)
- Internal for parsing
-
-- handle_endtag(self, tagName)
- handle_endtag - Internal for parsing
-
-- handle_entityref(self, entity)
- Internal for parsing
-
-- handle_startendtag(self, tagName, attributeList)
- handle_startendtag - Internal for parsing
-
-- parseFile(self, filename)
- parseFile - Parses a file and creates the DOM tree and indexes
-
- @param filename <str/file> - A string to a filename or a file object. If file object, it will not be closed, you must close.
-
-- parseStr(self, html)
- parseStr - Parses a string and creates the DOM tree and indexes.
-
- @param html <str> - valid HTML
-
-- setRoot(self, root)
- setRoot - Sets the root node, and reprocesses the indexes
-
-@param root - AdvancedTag to be new root
-
-- unknown_decl(self, decl)
- Internal for parsing
-
-
-Methods inherited from html.parser.HTMLParser:
-- check_for_whole_start_tag(self, i)
- # Internal -- check to see if we have a complete starttag; return end
-# or -1 if incomplete.
-
-- clear_cdata_mode(self)
-
-- close(self)
- Handle any buffered data.
-
-- get_starttag_text(self)
- Return full source of start tag: '<...>'.
-
-- goahead(self, end)
- # Internal -- handle data as far as reasonable. May leave state
-# and data to be processed by a subsequent call. If 'end' is
-# true, force handling all data as if followed by EOF marker.
-
-- handle_pi(self, data)
- # Overridable -- handle processing instruction
-
-- parse_bogus_comment(self, i, report=1)
- # Internal -- parse bogus comment, return length or -1 if not terminated
-# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
-
-- parse_endtag(self, i)
- # Internal -- parse endtag, return end or -1 if incomplete
-
-- parse_html_declaration(self, i)
- # Internal -- parse html declarations, return length or -1 if not terminated
-# See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
-# See also parse_declaration in _markupbase
-
-- parse_pi(self, i)
- # Internal -- parse processing instr, return end or -1 if not terminated
-
-- parse_starttag(self, i)
- # Internal -- handle starttag, return end or -1 if not terminated
-
-- reset(self)
- Reset this instance. Loses all unprocessed data.
-
-- set_cdata_mode(self, elem)
-
-- unescape(self, s)
- # Internal -- helper to remove special character quoting
-
-
-Data and other attributes inherited from html.parser.HTMLParser:
-- CDATA_CONTENT_ELEMENTS = ('script', 'style')
-
-
-Methods inherited from _markupbase.ParserBase:
-- error(self, message)
-
-- getpos(self)
- Return current line number and offset.
-
-- parse_comment(self, i, report=1)
- # Internal -- parse comment, return length or -1 if not terminated
-
-- parse_declaration(self, i)
- # Internal -- parse declaration (for use by subclasses).
-
-- parse_marked_section(self, i, report=1)
- # Internal -- parse a marked section
-# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
-
-- updatepos(self, i, j)
- # Internal -- update line number and offset. This should be
-# called for each piece of data exactly once, in order -- in other
-# words the concatenation of all the input strings to this
-# function should be exactly the entire input.
-
-
-Data descriptors inherited from _markupbase.ParserBase:
-- __dict__
-- dictionary for instance variables (if defined)
-
-- __weakref__
-- list of weak references to the object (if defined)
-
- |
-
-
-
-class AdvancedHTMLSlimTagMiniFormatter(AdvancedHTMLMiniFormatter) |
-
- |
-AdvancedHTMLSlimTagMiniFormatter(encoding='utf-8', slimSelfClosing=False)
-
-AdvancedHTMLSlimTagMiniFormatter - A "mini" formatter that
- removes all non-functional whitespace (including all indentations)
-
-Also uses "slim" start tags, @see AdvancedHTMLSlimTagFormatter for more info |
- |
-- Method resolution order:
-- AdvancedHTMLSlimTagMiniFormatter
-- AdvancedHTMLMiniFormatter
-- AdvancedHTMLFormatter
-- html.parser.HTMLParser
-- _markupbase.ParserBase
-- builtins.object
-
-
-Methods defined here:
-- __init__(self, encoding='utf-8', slimSelfClosing=False)
- __init__ - Create an AdvancedHTMLSlimTagMiniFormatter
-
- @see AdvancedHTMLMiniFormatter
-
- @param slimSelfClosing <bool> Default False - If True, will use slim self-closing tags,
-
- e.g. <br /> becomes <br/>
-
-- handle_starttag = handle_starttag_slim(self, tagName, attributeList, isSelfClosing=False)
- handle_starttag_slim - Handles parsing a start tag, but with "slim" start tags
-
- @see AdvancedHTMLFormatter.handle_starttag
-
-
-Methods inherited from AdvancedHTMLFormatter:
-- feed(self, contents)
- feed - Load contents
-
-@param contents - HTML contents
-
-- getHTML(self)
- getHTML - Get the full HTML as contained within this tree, converted to valid XHTML
- @returns - String
-
-- getRoot(self)
- getRoot - returns the root Tag
- @return - AdvancedTag at root. If you provided multiple root nodes, this will be a "holder" with tagName value as constants.INVISIBLE_ROOT_TAG
-
-- getRootNodes(self)
- getRootNodes - Gets all objects at the "root" (first level; no parent). Use this if you may have multiple roots (not children of <html>)
- Use this method to get objects, for example, in an AJAX request where <html> may not be your root.
-
- Note: If there are multiple root nodes (i.e. no <html> at the top), getRoot will return a special tag. This function automatically
- handles that, and returns all root nodes.
-
- @return list<AdvancedTag> - A list of AdvancedTags which are at the root level of the tree.
-
-- handle_charref(self, charRef)
- Internal for parsing
-
-- handle_comment(self, comment)
- Internal for parsing
-
-- handle_data(self, data)
- handle_data - Internal for parsing
-
-- handle_decl(self, decl)
- Internal for parsing
-
-- handle_endtag(self, tagName)
- handle_endtag - Internal for parsing
-
-- handle_entityref(self, entity)
- Internal for parsing
-
-- handle_startendtag(self, tagName, attributeList)
- handle_startendtag - Internal for parsing
-
-- parseFile(self, filename)
- parseFile - Parses a file and creates the DOM tree and indexes
-
- @param filename <str/file> - A string to a filename or a file object. If file object, it will not be closed, you must close.
-
-- parseStr(self, html)
- parseStr - Parses a string and creates the DOM tree and indexes.
-
- @param html <str> - valid HTML
-
-- setRoot(self, root)
- setRoot - Sets the root node, and reprocesses the indexes
-
-@param root - AdvancedTag to be new root
-
-- unknown_decl(self, decl)
- Internal for parsing
-
-
-Methods inherited from html.parser.HTMLParser:
-- check_for_whole_start_tag(self, i)
- # Internal -- check to see if we have a complete starttag; return end
-# or -1 if incomplete.
-
-- clear_cdata_mode(self)
-
-- close(self)
- Handle any buffered data.
-
-- get_starttag_text(self)
- Return full source of start tag: '<...>'.
-
-- goahead(self, end)
- # Internal -- handle data as far as reasonable. May leave state
-# and data to be processed by a subsequent call. If 'end' is
-# true, force handling all data as if followed by EOF marker.
-
-- handle_pi(self, data)
- # Overridable -- handle processing instruction
-
-- parse_bogus_comment(self, i, report=1)
- # Internal -- parse bogus comment, return length or -1 if not terminated
-# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
-
-- parse_endtag(self, i)
- # Internal -- parse endtag, return end or -1 if incomplete
-
-- parse_html_declaration(self, i)
- # Internal -- parse html declarations, return length or -1 if not terminated
-# See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
-# See also parse_declaration in _markupbase
-
-- parse_pi(self, i)
- # Internal -- parse processing instr, return end or -1 if not terminated
-
-- parse_starttag(self, i)
- # Internal -- handle starttag, return end or -1 if not terminated
-
-- reset(self)
- Reset this instance. Loses all unprocessed data.
-
-- set_cdata_mode(self, elem)
-
-- unescape(self, s)
- # Internal -- helper to remove special character quoting
-
-
-Data and other attributes inherited from html.parser.HTMLParser:
-- CDATA_CONTENT_ELEMENTS = ('script', 'style')
-
-
-Methods inherited from _markupbase.ParserBase:
-- error(self, message)
-
-- getpos(self)
- Return current line number and offset.
-
-- parse_comment(self, i, report=1)
- # Internal -- parse comment, return length or -1 if not terminated
-
-- parse_declaration(self, i)
- # Internal -- parse declaration (for use by subclasses).
-
-- parse_marked_section(self, i, report=1)
- # Internal -- parse a marked section
-# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
-
-- updatepos(self, i, j)
- # Internal -- update line number and offset. This should be
-# called for each piece of data exactly once, in order -- in other
-# words the concatenation of all the input strings to this
-# function should be exactly the entire input.
-
-
-Data descriptors inherited from _markupbase.ParserBase:
-- __dict__
-- dictionary for instance variables (if defined)
-
-- __weakref__
-- list of weak references to the object (if defined)
-
- |
-
-
-
-class AdvancedTag(builtins.object) |
-
- |
-AdvancedTag(tagName, attrList=None, isSelfClosing=False, ownerDocument=None)
-
-AdvancedTag - Represents a Tag. Used with AdvancedHTMLParser to create a DOM-model
-
-Keep tag names lowercase.
-
-Use the getters and setters instead of attributes directly, or you may lose accounting. |
- |
-Methods defined here:
-- __copy__(self)
- __copy__ - Create a copy (except uid). This tag will NOT ==.
-
- but is safe to add to the same tree as its original
-
-- __deepcopy__(self, arg)
- __deepcopy__ - Create a copy (except uid) for deepcopy. This tag will NOT ==
-
- but is safe to add to the same tree as its original
-
-- __eq__(self, other)
- __eq__ - Test if this and other are THE SAME TAG.
-
-Note: this does NOT test if the tags have the same name, attributes, etc.
- Use isTagEqual to test if a tag has the same data (other than children)
-
-So for example:
-
- tag1 = document.getElementById('something')
- tag2 = copy.copy(tag1)
-
- tag1 == tag2 # This is False
- tag1.isTagEqual(tag2) # This is True
-
-- __getattribute__(self, name)
- Return getattr(self, name).
-
-- __getitem__(self, key)
-
-- __getstate__(self)
- __getstate__ - Get state for pickling
-
- @return <dict>
-
-- __hash__(self)
- Return hash(self).
-
-- __init__(self, tagName, attrList=None, isSelfClosing=False, ownerDocument=None)
- __init__ - Construct
-
- @param tagName - String of tag name. This will be lowercased!
- @param attrList - A list of tuples (key, value)
- @param isSelfClosing - True if self-closing tag ( <tagName attrs /> ). Will be set to False if text or children are added.
- @param ownerDocument <None/AdvancedHTMLParser> - The parser (document) associated with this tag, or None for no association
-
-- __ne__(self, other)
- __ne__ - Test if this and other are NOT THE SAME TAG.
-
-Note: this does NOT test if the tags have the same name, attributes, etc.
- Use isTagEqual to test if a tag has the same data (other than children)
-
-@see AdvancedTag.__eq__
-@see AdvancedTag.isTagEqual
-
-- __repr__(self)
- __repr__ - A reconstructable representation of this AdvancedTag.
-
- TODO: Incorporate uid somehow? Without it the tags won't be the SAME TAG, but they'll be equivalent
-
-- __setattr__(self, name, value)
- __setattr__ - Called with dot-access assignment, like: myTag.attr = "value"
-
- This method applies the special HTML/JS rules to dot-access,
- and allows setting several attributes directly, and conversion on special names
- such as myTag.className -> "class" attribute
-
- @param name <str> - The name of the attribute after the dot
-
- @param value <multiple types> - The value to assign
-
- @return - The value assigned ( may not match the passed in #value, for example the attribute
- "style" takes a string value, but will return a special type StyleAttribute to support
- access with javascript-like behaviour )
-
-- __setstate__(self, state)
- __setstate__ - Set state when loading pickle
-
- @param state <dict>
-
-- __str__(self)
- __str__ - Returns the HTML representation for this tag (including children).
-
- NOTE: This changed in 7.3.1 to be equivalent to self.outerHTML (or to new getHTML method, which is the same).
-
- The old method just included the start tag, the joined direct text node children, and the end tag.
- This compacts well for debug display, but doesn't give a clear picture of what's going on.
-
- The old method is still available as AdvancedTag._old__str__
-
- To revert str(myTag) back to the old behaviour:
-
- from AdvancedHTMLParser.Tags import AdvancedTag
-
- AdvancedTag.__str__ = AdvancedTag._old__str__
-
-- addClass(self, className)
- addClass - append a class name to the end of the "class" attribute, if not present
-
- @param className <str> - The name of the class to add
-
-- append = appendBlock(self, block)
-
-- appendBlock(self, block)
- append / appendBlock - Append a block to this element. A block can be a string (text node), or an AdvancedTag (tag node)
-
-@param <str/AdvancedTag> - block to add
-
-@return - #block
-
-NOTE: To add multiple blocks, @see appendBlocks
- If you know the type, use either @see appendChild for tags or @see appendText for text
-
-- appendBlocks(self, blocks)
- appendBlocks - Append blocks to this element. A block can be a string (text node), or an AdvancedTag (tag node)
-
-@param blocks list<str/AdvancedTag> - A list, in order to append, of blocks to add.
-
-@return - #blocks
-
-NOTE: To add a single block, @see appendBlock
- If you know the type, use either @see appendChild for tags or @see appendText for text
-
-- appendChild(self, child)
- appendChild - Append a child to this element.
-
-@param child <AdvancedTag> - Append a child element to this element
-
-- appendInnerHTML(self, html)
- appendInnerHTML - Appends nodes from arbitrary HTML as if doing element.innerHTML += 'someHTML' in javascript.
-
-@param html <str> - Some HTML
-
-NOTE: If associated with a document ( AdvancedHTMLParser ), the html will use the encoding associated with
- that document.
-
-@return - None. A browser would return innerHTML, but that's somewhat expensive on a high-level node.
- So just call .innerHTML explicitly if you need that
-
-- appendNode = appendChild(self, child)
-
-- appendText(self, text)
- appendText - append some inner text
-
-- asHTML = toHTML(self)
-
-- cloneNode(self)
- cloneNode - Clone this node (tag name and attributes). Does not clone children.
-
-Tags will be equal according to isTagEqual method, but will contain a different internal
-unique id such that origTag != origTag.cloneNode() , as is the case in JS DOM.
-
-- contains(self, other)
- contains - Check if a provided tag appears anywhere beneath this node (at any level), or is this node itself.
-
- @param other <AdvancedTag> - Tag to check
-
-@return <bool> - True if #other appears anywhere beneath or is this tag, otherwise False
-
-- containsUid(self, uid)
- containsUid - Check if the uid (unique internal ID) belongs to this node itself, or to any child at any level beneath this node.
-
- @param uid <uuid.UUID> - uuid to check
-
-@return <bool> - True if #uid is this node's uid, or is the uid of any children at any level down
-
-- filter(self, **kwargs)
- filter aka filterAnd - Perform a filter operation on this node and all children (and all their children, onto the end)
-
-Results must match ALL the filter criteria. For ANY, use the *Or methods
-
-For special filter keys, @see #AdvancedHTMLParser.AdvancedHTMLParser.filter
-
-Requires the QueryableList module to be installed (i.e. AdvancedHTMLParser was installed
- without '--no-deps' flag.)
-
-For alternative without QueryableList,
- consider #AdvancedHTMLParser.AdvancedHTMLParser.find method or the getElement* methods
-
-@return TagCollection<AdvancedTag>
-
-- filterAnd = filter(self, **kwargs)
-
-- filterOr(self, **kwargs)
- filterOr - Perform a filter operation on this node and all children (and their children, onto the end)
-
-Results must match ANY of the filter criteria. For ALL, use the *And methods
-
-For special filter keys, @see #AdvancedHTMLParser.AdvancedHTMLParser.filter
-
-Requires the QueryableList module to be installed (i.e. AdvancedHTMLParser was installed
- without '--no-deps' flag.)
-
-For alternative without QueryableList,
- consider #AdvancedHTMLParser.AdvancedHTMLParser.find method or the getElement* methods
-
-@return TagCollection<AdvancedTag>
-
-- getAllChildNodeUids(self)
- getAllChildNodeUids - Returns all the unique internal IDs for all children, and their children,
- so on and so forth until the end.
-
- For performing "contains node" kind of logic, this is more efficient than copying the entire nodeset
-
-@return set<uuid.UUID> A set of uuid objects
-
-- getAllChildNodes(self)
- getAllChildNodes - Gets all the children, and their children,
- and their children, and so on, all the way to the end as a TagCollection.
-
- Use .childNodes for a regular list
-
-@return TagCollection<AdvancedTag> - A TagCollection of all children (and their children recursive)
-
-- getAllNodeUids(self)
- getAllNodeUids - Returns all the unique internal IDs from getAllChildNodeUids, but also includes this tag's uid
-
-@return set<uuid.UUID> A set of uuid objects
-
-- getAllNodes(self)
- getAllNodes - Returns this node, all children, and all their children and so on till the end
-
-@return TagCollection<AdvancedTag>
-
-- getAttribute(self, attrName, defaultValue=None)
- getAttribute - Gets an attribute on this tag. Be wary using this for classname, maybe use addClass/removeClass. Attribute names are all lowercase.
- @return - The attribute value, or None if none exists.
-
-- getAttributesDict(self)
- getAttributesDict - Get a copy of all attributes as a dict map of name -> value
-
- ALL values are converted to string and copied, so modifications will not affect the original attributes.
- If you want types like "style" to work as before, you'll need to recreate those elements (like StyleAttribute(strValue) ).
-
- @return <dict ( str(name), str(value) )> - A dict of attrName to attrValue , all as strings and copies.
-
-- getAttributesList(self)
- getAttributesList - Get a copy of all attributes as a list of tuples (name, value)
-
- ALL values are converted to string and copied, so modifications will not affect the original attributes.
- If you want types like "style" to work as before, you'll need to recreate those elements (like StyleAttribute(strValue) ).
-
- @return list< tuple< str(name), str(value) > > - A list of tuples of attrName, attrValue pairs, all converted to strings.
-
- This is suitable for passing back into AdvancedTag when creating a new tag.
-
-- getBlocksTags(self)
- getBlocksTags - Returns a list of tuples referencing the blocks which are direct children of this node, and the block is an AdvancedTag.
-
- The tuples are ( block, blockIdx ) where "blockIdx" is the index of self.blocks wherein the tag resides.
-
- @return list< tuple(block, blockIdx) > - A list of tuples of child blocks which are tags and their index in the self.blocks list
-
-- getBlocksText(self)
- getBlocksText - Returns a list of tuples referencing the blocks which are direct children of this node, and the block is a text node (not an AdvancedTag)
-
- The tuples are ( block, blockIdx ) where "blockIdx" is the index of self.blocks wherein the text resides.
-
- @return list< tuple(block, blockIdx) > - A list of tuples of child blocks which are not tags and their index in the self.blocks list
-
-- getChildBlocks(self)
- getChildBlocks - Gets the child blocks, both text and tags.
-
-@see childBlocks
-
-- getChildren(self)
- getChildren - returns child nodes as a searchable TagCollection.
-
- For a plain list, use .children instead
-
- @return - TagCollection of the immediate children to this tag.
-
-- getElementById(self, _id)
- getElementById - Search children of this tag for a tag containing an id
-
-@param _id - String of id
-
-@return - AdvancedTag or None
-
-- getElementsByAttr(self, attrName, attrValue)
- getElementsByAttr - Search children of this tag for tags with an attribute name/value pair
-
-@param attrName - Attribute name (lowercase)
-@param attrValue - Attribute value
-
-@return - TagCollection of matching elements
-
-- getElementsByClassName(self, className)
- getElementsByClassName - Search children of this tag for tags containing a given class name
-
-@param className <str> - One or more space-separated class names
-
-@return - TagCollection of matching elements
-
-- getElementsByName(self, name)
- getElementsByName - Search children of this tag for tags with a given name
-
-@param name - name to search
-
-@return - TagCollection of matching elements
-
-- getElementsCustomFilter(self, filterFunc)
- getElementsCustomFilter - Searches children of this tag for those matching a provided user function
-
-@param filterFunc <function> - A function or lambda expression that should return "True" if the passed node matches criteria.
-
-@return - TagCollection of matching results
-
-@see getFirstElementCustomFilter
-
-- getElementsWithAttrValues(self, attrName, attrValues)
- getElementsWithAttrValues - Search children of this tag for tags with an attribute name and one of several values
-
-@param attrName <lowercase str> - Attribute name (lowercase)
-@param attrValues set<str> - set of acceptable attribute values
-
-@return - TagCollection of matching elements
-
-- getEndTag(self)
- getEndTag - returns the end tag representation as HTML string
-
-@return - String of end tag
-
-- getFirstElementCustomFilter(self, filterFunc)
- getFirstElementCustomFilter - Gets the first element which matches a given filter func.
-
- Scans first child, to the bottom, then next child to the bottom, etc. Does not include "self" node.
-
-@param filterFunc <function> - A function or lambda expression that should return "True" if the passed node matches criteria.
-
-@return <AdvancedTag/None> - First match, or None
-
-@see getElementsCustomFilter
-
-- getHTML = toHTML(self)
-
-- getParentElementCustomFilter(self, filterFunc)
- getParentElementCustomFilter - Runs through parent on up to document root, returning the
-
- first tag which filterFunc(tag) returns True.
-
- @param filterFunc <function/lambda> - A function or lambda expression that should return "True" if the passed node matches criteria.
-
- @return <AdvancedTag/None> - First match, or None
-
-
- @see getFirstElementCustomFilter for matches against children
-
-- getPeers(self)
- getPeers - Get elements who share a parent with this element
-
-@return - TagCollection of elements
-
-- getPeersByAttr(self, attrName, attrValue)
- getPeersByAttr - Gets peers (elements on same level) which match an attribute/value combination.
-
-@param attrName - Name of attribute
-@param attrValue - Value that must match
-
-@return - None if no parent element (error condition), otherwise a TagCollection of peers that matched.
-
-- getPeersByClassName(self, className)
- getPeersByClassName - Gets peers (elements on same level) with a given class name
-
-@param className - classname must contain this name
-
-@return - None if no parent element (error condition), otherwise a TagCollection of peers that matched.
-
-- getPeersByName(self, name)
- getPeersByName - Gets peers (elements on same level) with a given name
-
-@param name - Name to match
-
-@return - None if no parent element (error condition), otherwise a TagCollection of peers that matched.
-
-- getPeersCustomFilter(self, filterFunc)
- getPeersCustomFilter - Get elements who share a parent with this element and also pass a custom filter check
-
- @param filterFunc <lambda/function> - Passed in an element, and returns True if it should be treated as a match, otherwise False.
-
- @return <TagCollection> - Resulting peers, or None if no parent node.
-
-- getPeersWithAttrValues(self, attrName, attrValues)
- getPeersWithAttrValues - Gets peers (elements on same level) whose attribute given by #attrName
- is in the list of possible values #attrValues
-
-@param attrName - Name of attribute
-@param attrValues - List of possible values which will match
-
-@return - None if no parent element (error condition), otherwise a TagCollection of peers that matched.
-
-- getStartTag(self)
- getStartTag - Returns the start tag represented as HTML
-
-@return - String of start tag with attributes
-
-- getStyle(self, styleName)
- getStyle - Gets the value of a style parameter, part of the "style" attribute
-
-@param styleName - The name of the style
-
-@return - String of the value of the style. '' is no value.
-
-- getStyleDict(self)
- getStyleDict - Gets a dictionary of style attribute/value pairs.
-
-@return - OrderedDict of "style" attribute.
-
-- getTagName(self)
- getTagName - Gets the tag name of this Tag (lowercase).
-
-@return - str - name of tag
-
-- getUid(self)
- getUid - Get the AdvancedHTMLParser unique id for this tag.
-
- Each tag is given a generated uuid at create time, and copies also get their own unique identifier.
-
- This can be used to determine if two tags are the same tag, beyond just having equal attribute name/value pairs and children.
-
- This is used internally to prevent duplicates, for example a TagCollection does not allow multiple tags with the same uid
-
- @return - uuid.UUID object, representing a uuid as specified by RFC 4122, version 4.
- This object is optimized for comparison. For a string representation, str() the result, or use .hex or .variant
-
-- hasAttribute(self, attrName)
- hasAttribute - Checks for the existence of an attribute. Attribute names are all lowercase.
-
- @param attrName <str> - The attribute name
-
- @return <bool> - True or False if attribute exists by that name
-
-- hasChild(self, child)
- hasChild - Returns if #child is a DIRECT child (tag) of this node.
-
-@param child <AdvancedTag> - The tag to check
-
-@return <bool> - If #child is a direct child of this node, True. Otherwise, False.
-
-- hasChildNodes(self)
- hasChildNodes - Checks if this node has any children (tags).
-
-@return <bool> - True if this child has any children, otherwise False.
-
-- hasClass(self, className)
- hasClass - Test if this tag has a particular class name ( class attribute )
-
-@param className - A class to search
-
-@return <bool> - True if provided class is present, otherwise False
-
-- insertAfter(self, child, afterChild)
- insertAfter - Inserts a child after #afterChild
-
-
- @param child <AdvancedTag/str> - Child block to insert
-
- @param afterChild <AdvancedTag/str> - Child block to insert after. if None, will be appended
-
-@return - The added child. Note, if it is a text block (str), the return will NOT be linked by reference.
-
-- insertBefore(self, child, beforeChild)
- insertBefore - Inserts a child before #beforeChild
-
-
- @param child <AdvancedTag/str> - Child block to insert
-
- @param beforeChild <AdvancedTag/str> - Child block to insert before. if None, will be appended
-
-@return - The added child. Note, if it is a text block (str), the return will NOT be linked by reference.
-
-@raises ValueError - If #beforeChild is defined and is not a child of this node
-
-- isEqualNode = __eq__(self, other)
-
-- isTagEqual(self, other)
- isTagEqual - Compare if a tag contains the same tag name and attributes as another tag,
-
- i.e. if everything between < and > parts of this tag are the same.
-
- Does NOT compare children, etc. Does NOT compare if these are the same exact tag in the html (use regular == operator for that)
-
- So for example:
-
- tag1 = document.getElementById('something')
- tag2 = copy.copy(tag1)
-
- tag1 == tag2 # This is False
- tag1.isTagEqual(tag2) # This is True
-
- @return bool - True if tags have the same name and attributes, otherwise False
-
-- remove(self)
- remove - Will remove this node from its parent, if it has a parent (thus taking it out of the HTML tree)
-
- NOTE: If you are using an IndexedAdvancedHTMLParser, calling this will NOT update the index. You MUST call
- reindex method manually.
-
-@return <bool> - While JS DOM defines no return for this function, this function will return True if a
- remove did happen, or False if no parent was set.
-
-- removeAttribute(self, attrName)
- removeAttribute - Removes an attribute, by name.
-
-@param attrName <str> - The attribute name
-
-- removeBlock(self, block)
- removeBlock - Removes a single block (text node or AdvancedTag) which is a child of this object.
-
-@param block <str/AdvancedTag> - The block (text node or AdvancedTag) to remove.
-
-@return Returns the removed block if one was removed, or None if requested block is not a child of this node.
-
-NOTE: If you know you are going to remove an AdvancedTag, @see removeChild
- If you know you are going to remove a text node, @see removeText
-
-If removing multiple blocks, @see removeBlocks
-
-- removeBlocks(self, blocks)
- removeBlocks - Removes a list of blocks (the first occurrence of each) from the direct children of this node.
-
-@param blocks list<str/AdvancedTag> - List of AdvancedTags for tag nodes, else strings for text nodes
-
-@return The removed blocks in each slot, or None if None removed.
-
-@see removeChild
-@see removeText
-
-For a single block, @see removeBlock
-
-- removeChild(self, child)
- removeChild - Remove a child tag, if present.
-
- @param child <AdvancedTag> - The child to remove
-
- @return - The child [with parentNode cleared] if removed, otherwise None.
-
- NOTE: This removes a tag. If removing a text block, use #removeText function.
- If you need to remove an arbitrary block (text or AdvancedTag), @see removeBlock
-
- Removing multiple children? @see removeChildren
-
-- removeChildren(self, children)
- removeChildren - Remove multiple child AdvancedTags.
-
-@see removeChild
-
-@return list<AdvancedTag/None> - A list of all tags removed in same order as passed.
- Item is "None" if it was not attached to this node, and thus was not removed.
-
-- removeClass(self, className)
- removeClass - remove a class name if present. Returns the class name if removed, otherwise None.
-
- @param className <str> - The name of the class to remove
-
- @return <str> - The class name removed if one was removed, otherwise None if #className wasn't present
-
-- removeNode = removeChild(self, child)
-
-- removeText(self, text)
- removeText - Removes the first occurrence of given text in a text node (i.e. not part of a tag)
-
-@param text <str> - text to remove
-
-@return text <str/None> - The text in that block (text node) after remove, or None if not found
-
-NOTE: To remove a node, @see removeChild
-NOTE: To remove a block (maybe a node, maybe text), @see removeBlock
-NOTE: To remove ALL occurrences of text, @see removeTextAll
-
-- removeTextAll(self, text)
- removeTextAll - Removes ALL occurrences of given text in a text node (i.e. not part of a tag)
-
-@param text <str> - text to remove
-
-@return list <str> - All text node containing #text BEFORE the text was removed.
- Empty list if no text removed
-
-NOTE: To remove a node, @see removeChild
-NOTE: To remove a block (maybe a node, maybe text), @see removeBlock
-NOTE: To remove a single occurrence of text, @see removeText
-
-- setAttribute(self, attrName, attrValue)
- setAttribute - Sets an attribute. Be wary using this for classname, maybe use addClass/removeClass. Attribute names are all lowercase.
-
-@param attrName <str> - The name of the attribute
-
-@param attrValue <str> - The value of the attribute
-
-
-@raises -
-
- KeyError if #attrName is invalid name for an attribute
-
-- setAttributes(self, attributesDict)
- setAttributes - Sets several attributes at once, using a dictionary of attrName : attrValue
-
-@param attributesDict - <str:str> - New attribute names -> values
-
-@raises -
-
-- setStyle(self, styleName, styleValue)
- setStyle - Sets a style param. Example: "display", "block"
-
- If you need to set many styles on an element, use setStyles instead.
- It takes a dictionary of attribute, value pairs and applies it all in one go (faster)
-
- To remove a style, set its value to empty string.
- When all styles are removed, the "style" attribute will be nullified.
-
-@param styleName - The name of the style element
-@param styleValue - The value of which to assign the style element
-
-@return - String of current value of "style" after change is made.
-
-- setStyles(self, styleUpdatesDict)
- setStyles - Sets one or more style params.
- This all happens in one shot, so it is much much faster than calling setStyle for every value.
-
- To remove a style, set its value to empty string.
- When all styles are removed, the "style" attribute will be nullified.
-
-@param styleUpdatesDict - Dictionary of attribute : value styles.
-
-@return - String of current value of "style" after change is made.
-
-- toHTML(self)
- toHTML - Get the HTML representation of this tag and all children
-
- @return <str> - HTML with this tag as the root
-
-
-Data descriptors defined here:
-- __dict__
-- dictionary for instance variables (if defined)
-
-- __weakref__
-- list of weak references to the object (if defined)
-
-- attributes
-- attributesDict - Returns the internal dict mapped to attributes on this object.
-
- Modifications made here WILL affect this tag, use getAttributesDict to get a copy.
-
- This is the default provider of the "attributes" property. Can be toggled to use the DOM-matching version, see @toggleAttributesDOM
-
- @return <dict> - Internal attributes
-
-- attributesDOM
-- attributes - Return a NamedNodeMap of the attributes on this object.
-
- This is a horrible method and is not used in practice anywhere sane.
-
- Please use setAttribute, getAttribute, hasAttribute methods instead.
-
- @see SpecialAttributes.NamedNodeMap
-
- This is NOT the default provider of the "attributes" property. Can be toggled to use the DOM-matching version, see @toggleAttributesDOM
-
-@return AttributeNodeMap
-
-- attributesDict
-- attributesDict - Returns the internal dict mapped to attributes on this object.
-
- Modifications made here WILL affect this tag, use getAttributesDict to get a copy.
-
- This is the default provider of the "attributes" property. Can be toggled to use the DOM-matching version, see @toggleAttributesDOM
-
- @return <dict> - Internal attributes
-
-- attributesList
-- attributesList - Returns a copy of internal attributes as a list. Same as getAttributesList method.
-
- @return list<tuple> - List of (key, value) tuples representing each attribute on this node
-
-
- @see getAttributesList
- @see attributesDict
-
-- childBlocks
-- childBlocks - Return immediate child blocks, both text and tags.
-
-@return list<AdvancedTag/str> - List of blocks associated with this node
-
-NOTE: This does what #childNodes does in JS DOM. Because for many years childNodes has returned
- ONLY tags on AdvancedHTMLParser, it would be a major change to match. Likely will be made in a future
- version.
-
-- childElementCount
-- childElementCount - Returns the number of direct children to this node
-
-@return <int> - The number of direct children to this node
-
-- childNodes
-- childNodes - returns immediate child nodes as a TagCollection
-
-@return - TagCollection of child nodes
-
-NOTE: Unlike JS DOM, this returns ONLY tags, not text blocks.
- Changing this would be a fairly-major backwards-incompatible change,
- and will likely be made in a future version.
-
- For now, use @see childBlocks method to get both text AND tags
-
-- classList
-- classList - get a copy of the list of the class names ( the "class" attribute ) for this element
-
- @return DOMTokenList<str> - A list of the class names for this element
-
-- className
-- className - property, string of 'class' attribute
-
-@return <str> - Class attribute, or empty string if not set
-
-- classNames
-- classList - get a copy of the list of the class names ( the "class" attribute ) for this element
-
- @return DOMTokenList<str> - A list of the class names for this element
-
-- firstChild
-- firstChild - property, Get the first child block, text or tag.
-
- @return <str/AdvancedTag/None> - The first child block, or None if no child blocks
-
-- firstElementChild
-- firstElementChild - property, Get the first child which is an element (AdvancedTag)
-
- @return <AdvancedTag/None> - The first element child, or None if no element child nodes
-
-- innerHTML
-- innerHTML - Returns an HTML string of the inner contents of this tag, including children.
-
-@return - String of inner contents HTML
-
-- innerText
-- innerText - property, gets the text of just this node. Use #textContent for this node and all children
-
- This is an alias of the .text property
-
- @return <str> - The text of this node
-
-- lastChild
-- lastChild - property, Get the last child block, text or tag
-
- @return <str/AdvancedTag/None> - The last child block, or None if no child blocks
-
-- lastElementChild
-- lastElementChild - property, Get the last child which is an element (AdvancedTag)
-
- @return <AdvancedTag/None> - The last element child, or None if no element child nodes
-
-- nextElementSibling
-- nextElementSibling - Returns the next sibling that is an element.
- This is the tag node following this node in the parent's list of children
-
- @return <None/AdvancedTag> - None if there are no children (tag) in the parent after this node,
- Otherwise the following element (tag)
-
-- nextSibling
-- nextSibling - Returns the next sibling. This is the child following this node in the parent's list of children.
-
- This could be text or an element. use nextSiblingElement to ensure element
-
- @return <None/str/AdvancedTag> - None if there are no nodes (text or tag) in the parent after this node,
- Otherwise the following node (text or tag)
-
-- nextSiblingElement
-- nextElementSibling - Returns the next sibling that is an element.
- This is the tag node following this node in the parent's list of children
-
- @return <None/AdvancedTag> - None if there are no children (tag) in the parent after this node,
- Otherwise the following element (tag)
-
-- nodeName
-- nodeName - Return the name of this node (tag name)
-
-- nodeType
-- nodeType - Return the type of this node (1 - ELEMENT_NODE)
-
-- nodeValue
-- nodeValue - Return the value of this node (None)
-
-- outerHTML
-- outerHTML - Returns start tag, innerHTML, and end tag as HTML string
-
-@return - String of start tag, innerHTML, and end tag
-
-- parentElement
-- parentElement - get the parent element of this node
-
- @return <AdvancedTag/None> - The parent node, or None if no parent
-
-- peers
-- peers - Get elements with same parent as this item
-
-@return - TagCollection of elements
-
-- previousElementSibling
-- previousElementSibling - Returns the previous sibling that is an element.
-
- This is the previous tag node in the parent's list of children
-
-
- @return <None/AdvancedTag> - None if there are no children (tag) in the parent before this node,
- Otherwise the previous element (tag)
-
-- previousSibling
-- previousSibling - Returns the previous sibling. This would be the previous node (text or tag) in the parent's list
-
- This could be text or an element. use previousSiblingElement to ensure element
-
-
- @return <None/str/AdvancedTag> - None if there are no nodes (text or tag) in the parent before this node,
- Otherwise the previous node (text or tag)
-
-- previousSiblingElement
-- previousElementSibling - Returns the previous sibling that is an element.
-
- This is the previous tag node in the parent's list of children
-
-
- @return <None/AdvancedTag> - None if there are no children (tag) in the parent before this node,
- Otherwise the previous element (tag)
-
-- tagBlocks
-- tagBlocks - Property.
- Returns all the blocks which are direct children of this node, where that block is a tag (not text)
-
- NOTE: This is similar to .children , and you should probably use .children instead except within this class itself
-
- @return list<AdvancedTag> - A list of direct children which are tags.
-
-- textBlocks
-- textBlocks - Property.
- Returns all the blocks which are direct children of this node, where that block is a text (not a tag)
-
- @return list<str> - A list of direct children which are text.
-
-- textContent
-- textContent - property, gets the text of this node and all inner nodes.
-
- Use .innerText for just this node's text
-
- @return <str> - The text of all nodes at this level or lower
-
- |
-
-
-
-class HTMLValidationException(builtins.Exception) |
-
- |
-HTMLValidationException - common baseclass for invalid-HTML validation errors |
- |
-- Method resolution order:
-- HTMLValidationException
-- builtins.Exception
-- builtins.BaseException
-- builtins.object
-
-
-Data descriptors defined here:
-- __weakref__
-- list of weak references to the object (if defined)
-
-
-Methods inherited from builtins.Exception:
-- __init__(self, /, *args, **kwargs)
- Initialize self. See help(type(self)) for accurate signature.
-
-
-Static methods inherited from builtins.Exception:
-- __new__(*args, **kwargs) from builtins.type
- Create and return a new object. See help(type) for accurate signature.
-
-
-Methods inherited from builtins.BaseException:
-- __delattr__(self, name, /)
- Implement delattr(self, name).
-
-- __getattribute__(self, name, /)
- Return getattr(self, name).
-
-- __reduce__(...)
- Helper for pickle.
-
-- __repr__(self, /)
- Return repr(self).
-
-- __setattr__(self, name, value, /)
- Implement setattr(self, name, value).
-
-- __setstate__(...)
-
-- __str__(self, /)
- Return str(self).
-
-- with_traceback(...)
- Exception.with_traceback(tb) --
-set self.__traceback__ to tb and return self.
-
-
-Data descriptors inherited from builtins.BaseException:
-- __cause__
-- exception cause
-
-- __context__
-- exception context
-
-- __dict__
-
-- __suppress_context__
-
-- __traceback__
-
-- args
-
- |
-
-
-
-class IndexedAdvancedHTMLParser(AdvancedHTMLParser) |
-
- |
-IndexedAdvancedHTMLParser(filename=None, encoding='utf-8', indexIDs=True, indexNames=True, indexClassNames=True, indexTagNames=True)
-
-An AdvancedHTMLParser that indexes for much much faster searching. If you are doing searching/validation, this is your best bet.
- If you are writing/modifying, you may use this, but be sure to call reindex() after changes. |
- |
-- Method resolution order:
-- IndexedAdvancedHTMLParser
-- AdvancedHTMLParser
-- html.parser.HTMLParser
-- _markupbase.ParserBase
-- builtins.object
-
-
-Methods defined here:
-- __init__(self, filename=None, encoding='utf-8', indexIDs=True, indexNames=True, indexClassNames=True, indexTagNames=True)
- __init__ - Creates an Advanced HTML parser object, with specific indexing settings.
-
- For the various index* arguments, if True the index will be collected and used (if useIndex=True [default] on get* function)
-
- @param filename <str> - Optional filename to parse. Otherwise use parseFile or parseStr methods.
- @param encoding <str> - Specifies the document encoding. Default utf-8
- @param indexIDs <bool> - True to create an index for getElementByID method. <default True>
- @param indexNames <bool> - True to create an index for getElementsByName method <default True>
- @param indexClassNames <bool> - True to create an index for getElementsByClassName method. <default True>
- @param indexTagNames <bool> - True to create an index for tag names. <default True>
-
- For indexing other attributes, see the more generic addIndexOnAttribute
-
-- addIndexOnAttribute(self, attributeName)
- addIndexOnAttribute - Add an index for an arbitrary attribute. This will be used by the getElementsByAttr function.
- You should do this prior to parsing, or call reindex. Otherwise it will be blank. "name" and "id" will have no effect.
-
- @param attributeName <lowercase str> - An attribute name. Will be lowercased.
-
-- disableIndexing(self)
- disableIndexing - Disables indexing. Consider using plain AdvancedHTMLParser class.
- Maybe useful in some scenarios where you want to parse, add a ton of elements, then index
- and do a bunch of searching.
-
-- getElementById(self, _id, root='root', useIndex=True)
- getElementById - Searches and returns the first (should only be one) element with the given ID.
-
- @param _id <str> - A string of the id attribute.
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root', the root of the parsed tree will be used.
- @param useIndex <bool> If useIndex is True and ids are indexed [see constructor] only the index will be used. Otherwise a full search is performed.
-
-- getElementsByAttr(self, attrName, attrValue, root='root', useIndex=True)
- getElementsByAttr - Searches the full tree for elements with a given attribute name and value combination. If you want multiple potential values, see getElementsWithAttrValues
- If you want an index on a random attribute, use the addIndexOnAttribute function.
-
- @param attrName <lowercase str> - A lowercase attribute name
- @param attrValue <str> - Expected value of attribute
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root', the root of the parsed tree will be used.
- @param useIndex <bool> If useIndex is True and this specific attribute is indexed [see addIndexOnAttribute] only the index will be used. Otherwise a full search is performed.
-
-- getElementsByClassName(self, className, root='root', useIndex=True)
- getElementsByClassName - Searches and returns all elements containing a given class name.
-
-
- @param className <str> - One or more space-separated class names
-
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root', the root of the parsed tree will be used.
-
- @param useIndex <bool> If useIndex is True and class names are indexed [see constructor] only the index will be used. Otherwise a full search is performed.
-
-- getElementsByName(self, name, root='root', useIndex=True)
- getElementsByName - Searches and returns all elements with a specific name.
-
- @param name <str> - A string of the name attribute
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root', the root of the parsed tree will be used.
- @param useIndex <bool> If useIndex is True and names are indexed [see constructor] only the index will be used. Otherwise a full search is performed.
-
-- getElementsByTagName(self, tagName, root='root', useIndex=True)
- getElementsByTagName - Searches and returns all elements with a specific tag name.
-
- @param tagName <lowercase str> - A lowercase string of the tag name.
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root', the root of the parsed tree will be used.
- @param useIndex - If True [default] and tag names are set to be indexed [default, see constructor], only the index will be used. If False, all tags
- will be searched.
-
-- getElementsWithAttrValues(self, attrName, values, root='root', useIndex=True)
- getElementsWithAttrValues - Returns elements with an attribute matching one of several values. For a single name/value combination, see getElementsByAttr
-
- @param attrName <lowercase str> - A lowercase attribute name
- @param values set<str> - Set of expected values of attribute
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root', the root of the parsed tree will be used.
- @param useIndex <bool> If useIndex is True and this specific attribute is indexed [see addIndexOnAttribute] only the index will be used. Otherwise a full search is performed.
-
-- handle_starttag(self, tagName, attributeList, isSelfClosing=False)
- internal for parsing
-
-- reindex(self, newIndexIDs=None, newIndexNames=None, newIndexClassNames=None, newIndexTagNames=None)
- reindex - reindex the tree. Optionally, change what fields are indexed.
-
- @param newIndexIDs <bool/None> - None to leave same, otherwise new value to index IDs
- @param newIndexNames <bool/None> - None to leave same, otherwise new value to index names
- @param newIndexClassNames <bool/None> - None to leave same, otherwise new value to index class names
- @param newIndexTagNames <bool/None> - None to leave same, otherwise new value to index tag names
-
-- removeIndexOnAttribute(self, attributeName)
- removeIndexOnAttribute - Remove an attribute from indexing (for getElementsByAttr function) and remove indexed data.
-
-@param attributeName <lowercase str> - An attribute name. Will be lowercased. "name" and "id" will have no effect.
-
-- setRoot(self, root)
- Sets the root node, and reprocesses the indexes
-
-@param root - AdvancedTag for root
-
-
-Methods inherited from AdvancedHTMLParser:
-- __contains__(self, other)
-
-- __getstate__(self)
- __getstate__ - Get state for pickling
-
- @return <dict>
-
-- __setstate__(self, state)
- __setstate__ - Restore state for loading pickle
-
- @param state <dict> - The state
-
-- asHTML = getHTML(self)
-
-- contains(self, em)
- Checks if #em is found anywhere within this element tree
-
-@param em <AdvancedTag> - Tag of interest
-
-@return <bool> - If element #em is within this tree
-
-- containsUid(self, uid)
- Check if #uid is found anywhere within this element tree
-
-@param uid <uuid.UUID> - Uid
-
-@return <bool> - If #uid is found within this tree
-
-- createElement(self, tagName)
- createElement - Create an unattached tag with the given tag name
-
-@param tagName <str> - Name of tag
-
-@return <AdvancedTag> - A tag with the given tag name
-
-- feed(self, contents)
- feed - Feed contents. Use parseStr or parseFile instead.
-
-@param contents - Contents
-
-- filter(self, **kwargs)
- filter aka filterAnd - Filter ALL the elements in this DOM.
-
-Results must match ALL the filter criteria. for ANY, use the *Or methods
-
-Requires the QueryableList module to be installed (i.e. AdvancedHTMLParser was installed
- without '--no-deps' flag.)
-
-For alternative without QueryableList,
- consider #AdvancedHTMLParser.AdvancedHTMLParser.find method or the getElement* methods
-
-Special Keys:
-
- tagname - The tag name
- text - The inner text
-
-@return TagCollection<AdvancedTag>
-
-- filterAnd = filter(self, **kwargs)
-
-- filterOr(self, **kwargs)
- filterOr - Perform a filter operation on this node and all children (and their children, onto the end)
-
-Results must match ANY of the filter criteria. For ALL, use the *And methods
-
-For special filter keys, @see #AdvancedHTMLParser.AdvancedHTMLParser.filter
-
-Requires the QueryableList module to be installed (i.e. AdvancedHTMLParser was installed
- without '--no-deps' flag.)
-
-For alternative, consider AdvancedHTMLParser.AdvancedHTMLParser.find method or the getElement* methods
-
-@return TagCollection<AdvancedTag>
-
-- find(self, **kwargs)
- find - Perform a search of elements using attributes as keys and potential values as values
-
- (i.e. parser.find(name='blah', tagname='span') will return all elements in this document
- with the name "blah" of the tag type "span" )
-
-Arguments are key = value, or key can equal a tuple/list of values to match ANY of those values.
-
-Append a key with __contains to test if some strs (or several possible strs) are within an element
-Append a key with __icontains to perform the same __contains op, but ignoring case
-
-Special keys:
-
- tagname - The tag name of the element
- text - The text within an element
-
-NOTE: Empty string means both "not set" and "no value" in this implementation.
-
-NOTE: If you installed the QueryableList module (i.e. ran setup.py without --no-deps) it is
- better to use the "filter"/"filterAnd" or "filterOr" methods, which are also available
- on all tags and tag collections (tag collections also have filterAllAnd and filterAllOr)
-
-
-@return TagCollection<AdvancedTag> - A list of tags that matched the filter criteria
-
-- getAllNodes(self)
- getAllNodes - Get every element
-
-@return TagCollection<AdvancedTag>
-
-- getElementsCustomFilter(self, filterFunc, root='root')
- getElementsCustomFilter - Scan elements using a provided function
-
-@param filterFunc <function>(node) - A function that takes an AdvancedTag as an argument, and returns True if some arbitrary criteria is met
-
-@return - TagCollection of all matching elements
-
-- getFirstElementCustomFilter(self, filterFunc, root='root')
- getFirstElementCustomFilter - Scan elements using a provided function, stop and return the first match.
-
- @see getElementsCustomFilter to match multiple elements
-
-@param filterFunc <function>(node) - A function that takes an AdvancedTag as an argument, and returns True if some arbitrary criteria is met
-
-@return - An AdvancedTag of the node that matched, or None if no match.
-
-- getFormattedHTML(self, indent=' ')
- getFormattedHTML - Get formatted and xhtml of this document, replacing the original whitespace
- with a pretty-printed version
-
-@param indent - space/tab/newline of each level of indent, or integer for how many spaces per level
-
-@return - <str> Formatted html
-
-@see getHTML - Get HTML with original whitespace
-
-@see getMiniHTML - Get HTML with only functional whitespace remaining
-
-- getHTML(self)
- getHTML - Get the full HTML as contained within this tree.
-
- If parsed from a document, this will contain the original whitespacing.
-
- @returns - <str> of html
-
- @see getFormattedHTML
-
- @see getMiniHTML
-
-- getMiniHTML(self)
- getMiniHTML - Gets the HTML representation of this document without any pretty formatting
- and disregarding original whitespace beyond the functional.
-
- @return <str> - HTML with only functional whitespace present
-
-- getRoot(self)
- getRoot - returns the root Tag.
-
- NOTE: if there are multiple roots, this will be a special tag.
- You may want to consider using getRootNodes instead if this
- is a possible situation for you.
-
-@return AdvancedTag
-
-- getRootNodes(self)
- getRootNodes - Gets all objects at the "root" (first level; no parent). Use this if you may have multiple roots (not children of <html>)
- Use this method to get objects, for example, in an AJAX request where <html> may not be your root.
-
- Note: If there are multiple root nodes (i.e. no <html> at the top), getRoot will return a special tag. This function automatically
- handles that, and returns all root nodes.
-
- @return list<AdvancedTag> - A list of AdvancedTags which are at the root level of the tree.
-
-- handle_charref(self, charRef)
- Internal for parsing
-
-- handle_comment(self, comment)
- Internal for parsing
-
-- handle_data(self, data)
- Internal for parsing
-
-- handle_decl(self, decl)
- Internal for parsing
-
-- handle_endtag(self, tagName)
- Internal for parsing
-
-- handle_entityref(self, entity)
- Internal for parsing
-
-- handle_startendtag(self, tagName, attributeList)
- Internal for parsing
-
-- parseFile(self, filename)
- parseFile - Parses a file and creates the DOM tree and indexes
-
- @param filename <str/file> - A string to a filename or a file object. If file object, it will not be closed, you must close.
-
-- parseStr(self, html)
- parseStr - Parses a string and creates the DOM tree and indexes.
-
- @param html <str> - valid HTML
-
-- setDoctype(self, newDoctype)
- setDoctype - Set the doctype for this document, or clear it.
-
- @param newDoctype <str/None> -
-
- If None, will clear the doctype and not return one with #getHTML
-
- Otherwise, a string of the full doctype tag.
-
- For example, the HTML5 doctype would be "DOCTYPE html"
-
-- toHTML = getHTML(self)
-
-- unknown_decl(self, decl)
- Internal for parsing
-
-
-Class methods inherited from AdvancedHTMLParser:
-- createBlocksFromHTML(html, encoding='utf-8') from builtins.type
- createBlocksFromHTML - Returns the root level node (unless multiple nodes), and
- a list of "blocks" added (text and nodes).
-
-@return list< str/AdvancedTag > - List of blocks created. May be strings (text nodes) or AdvancedTag (tags)
-
-NOTE:
- Results may be checked by:
-
- issubclass(block.__class__, AdvancedTag)
-
- If True, block is a tag, otherwise, it is a text node
-
-- createElementFromHTML(html, encoding='utf-8') from builtins.type
- createElementFromHTML - Creates an element from a string of HTML.
-
- If this could create multiple root-level elements (children are okay),
- you must use #createElementsFromHTML which returns a list of elements created.
-
-@param html <str> - Some html data
-
-@param encoding <str> - Encoding to use for document
-
-@raises MultipleRootNodeException - If given html would produce multiple root-level elements (use #createElementsFromHTML instead)
-
-@return AdvancedTag - A single AdvancedTag
-
-NOTE: If there is text outside the tag, it will be lost in this.
- Use createBlocksFromHTML instead if you need to retain both text and tags.
-
- Also, if you are just appending to an existing tag, use AdvancedTag.appendInnerHTML
-
-- createElementsFromHTML(html, encoding='utf-8') from builtins.type
- createElementsFromHTML - Creates elements from provided html, and returns a list of the root-level elements
- children of these root-level nodes are accessible via the usual means.
-
-@param html <str> - Some html data
-
-@param encoding <str> - Encoding to use for document
-
-@return list<AdvancedTag> - The root (top-level) tags from parsed html.
-
-NOTE: If there is text outside the tags, it will be lost in this.
- Use createBlocksFromHTML instead if you need to retain both text and tags.
-
- Also, if you are just appending to an existing tag, use AdvancedTag.appendInnerHTML
-
-
-Data descriptors inherited from AdvancedHTMLParser:
-- body
-- body - Get the body element
-
-@return <AdvancedTag> - The body tag, or None if no body tag present
-
-- forms
-- forms - Return all forms associated with this document
-
-@return <TagCollection> - All "form" elements
-
-- head
-- head - Get the head element
-
-@return <AdvancedTag> - The head tag, or None if no head tag present
-
-
-Methods inherited from html.parser.HTMLParser:
-- check_for_whole_start_tag(self, i)
- # Internal -- check to see if we have a complete starttag; return end
-# or -1 if incomplete.
-
-- clear_cdata_mode(self)
-
-- close(self)
- Handle any buffered data.
-
-- get_starttag_text(self)
- Return full source of start tag: '<...>'.
-
-- goahead(self, end)
- # Internal -- handle data as far as reasonable. May leave state
-# and data to be processed by a subsequent call. If 'end' is
-# true, force handling all data as if followed by EOF marker.
-
-- handle_pi(self, data)
- # Overridable -- handle processing instruction
-
-- parse_bogus_comment(self, i, report=1)
- # Internal -- parse bogus comment, return length or -1 if not terminated
-# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
-
-- parse_endtag(self, i)
- # Internal -- parse endtag, return end or -1 if incomplete
-
-- parse_html_declaration(self, i)
- # Internal -- parse html declarations, return length or -1 if not terminated
-# See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
-# See also parse_declaration in _markupbase
-
-- parse_pi(self, i)
- # Internal -- parse processing instr, return end or -1 if not terminated
-
-- parse_starttag(self, i)
- # Internal -- handle starttag, return end or -1 if not terminated
-
-- reset(self)
- Reset this instance. Loses all unprocessed data.
-
-- set_cdata_mode(self, elem)
-
-- unescape(self, s)
- # Internal -- helper to remove special character quoting
-
-
-Data and other attributes inherited from html.parser.HTMLParser:
-- CDATA_CONTENT_ELEMENTS = ('script', 'style')
-
-
-Methods inherited from _markupbase.ParserBase:
-- error(self, message)
-
-- getpos(self)
- Return current line number and offset.
-
-- parse_comment(self, i, report=1)
- # Internal -- parse comment, return length or -1 if not terminated
-
-- parse_declaration(self, i)
- # Internal -- parse declaration (for use by subclasses).
-
-- parse_marked_section(self, i, report=1)
- # Internal -- parse a marked section
-# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
-
-- updatepos(self, i, j)
- # Internal -- update line number and offset. This should be
-# called for each piece of data exactly once, in order -- in other
-# words the concatenation of all the input strings to this
-# function should be exactly the entire input.
-
-
-Data descriptors inherited from _markupbase.ParserBase:
-- __dict__
-- dictionary for instance variables (if defined)
-
-- __weakref__
-- list of weak references to the object (if defined)
-
- |
-
-
-
-class InvalidCloseException(HTMLValidationException) |
-
- |
-InvalidCloseException(triedToClose, stillOpen)
-
-InvalidCloseException - Raised when a tag is closed that shouldn't be closed in validating parser |
- |
-- Method resolution order:
-- InvalidCloseException
-- HTMLValidationException
-- builtins.Exception
-- builtins.BaseException
-- builtins.object
-
-
-Methods defined here:
-- __init__(self, triedToClose, stillOpen)
- Initialize self. See help(type(self)) for accurate signature.
-
-
-Data descriptors inherited from HTMLValidationException:
-- __weakref__
-- list of weak references to the object (if defined)
-
-
-Static methods inherited from builtins.Exception:
-- __new__(*args, **kwargs) from builtins.type
- Create and return a new object. See help(type) for accurate signature.
-
-
-Methods inherited from builtins.BaseException:
-- __delattr__(self, name, /)
- Implement delattr(self, name).
-
-- __getattribute__(self, name, /)
- Return getattr(self, name).
-
-- __reduce__(...)
- Helper for pickle.
-
-- __repr__(self, /)
- Return repr(self).
-
-- __setattr__(self, name, value, /)
- Implement setattr(self, name, value).
-
-- __setstate__(...)
-
-- __str__(self, /)
- Return str(self).
-
-- with_traceback(...)
- Exception.with_traceback(tb) --
-set self.__traceback__ to tb and return self.
-
-
-Data descriptors inherited from builtins.BaseException:
-- __cause__
-- exception cause
-
-- __context__
-- exception context
-
-- __dict__
-
-- __suppress_context__
-
-- __traceback__
-
-- args
-
- |
-
-
-
-class MissedCloseException(HTMLValidationException) |
-
- |
-MissedCloseException(triedToClose, stillOpen)
-
-MissedCloseException - Raised when a close was missed in validating parser |
- |
-- Method resolution order:
-- MissedCloseException
-- HTMLValidationException
-- builtins.Exception
-- builtins.BaseException
-- builtins.object
-
-
-Methods defined here:
-- __init__(self, triedToClose, stillOpen)
- Initialize self. See help(type(self)) for accurate signature.
-
-
-Data descriptors inherited from HTMLValidationException:
-- __weakref__
-- list of weak references to the object (if defined)
-
-
-Static methods inherited from builtins.Exception:
-- __new__(*args, **kwargs) from builtins.type
- Create and return a new object. See help(type) for accurate signature.
-
-
-Methods inherited from builtins.BaseException:
-- __delattr__(self, name, /)
- Implement delattr(self, name).
-
-- __getattribute__(self, name, /)
- Return getattr(self, name).
-
-- __reduce__(...)
- Helper for pickle.
-
-- __repr__(self, /)
- Return repr(self).
-
-- __setattr__(self, name, value, /)
- Implement setattr(self, name, value).
-
-- __setstate__(...)
-
-- __str__(self, /)
- Return str(self).
-
-- with_traceback(...)
- Exception.with_traceback(tb) --
-set self.__traceback__ to tb and return self.
-
-
-Data descriptors inherited from builtins.BaseException:
-- __cause__
-- exception cause
-
-- __context__
-- exception context
-
-- __dict__
-
-- __suppress_context__
-
-- __traceback__
-
-- args
-
- |
-
-
-
-class MultipleRootNodeException(builtins.Exception) |
-
- |
-Exception raised and used internally when you try to use multiple root nodes
- Example:
- <one>
- <b>Hi</b>
- <i>Hello</i>
- </one>
- <two>
- <b>Cheese</b>
- <i>Ssdf</i>
- </two>
-
-This is legal, a fake root node with tag name of constants.INVISIBLE_TAG_NAME will be set at head, and all methods will handle it correctly.
-If you need to get the root nodes, and there's the possibility of more than one, consider getRootNodes instead of getRoot. |
- |
-- Method resolution order:
-- MultipleRootNodeException
-- builtins.Exception
-- builtins.BaseException
-- builtins.object
-
-
-Data descriptors defined here:
-- __weakref__
-- list of weak references to the object (if defined)
-
-
-Methods inherited from builtins.Exception:
-- __init__(self, /, *args, **kwargs)
- Initialize self. See help(type(self)) for accurate signature.
-
-
-Static methods inherited from builtins.Exception:
-- __new__(*args, **kwargs) from builtins.type
- Create and return a new object. See help(type) for accurate signature.
-
-
-Methods inherited from builtins.BaseException:
-- __delattr__(self, name, /)
- Implement delattr(self, name).
-
-- __getattribute__(self, name, /)
- Return getattr(self, name).
-
-- __reduce__(...)
- Helper for pickle.
-
-- __repr__(self, /)
- Return repr(self).
-
-- __setattr__(self, name, value, /)
- Implement setattr(self, name, value).
-
-- __setstate__(...)
-
-- __str__(self, /)
- Return str(self).
-
-- with_traceback(...)
- Exception.with_traceback(tb) --
-set self.__traceback__ to tb and return self.
-
-
-Data descriptors inherited from builtins.BaseException:
-- __cause__
-- exception cause
-
-- __context__
-- exception context
-
-- __dict__
-
-- __suppress_context__
-
-- __traceback__
-
-- args
-
- |
-
-
-
-class StyleAttribute(builtins.object) |
-
- |
-StyleAttribute(styleValue, tag=None)
-
-StyleAttribute - Represents the "style" field on a tag. |
- |
-Methods defined here:
-- __copy__(self)
-
-- __deepcopy__(self, memo)
-
-- __eq__(self, other)
- __eq__ - Test if two "style" tag properties are equal.
-
- NOTE: This differs from javascript. In javascript, no two styles equal each other, it's
- an identity comparison not a value comparison.
-
- I don't understand how that is useful, but in a future version we may choose to adopt
- that "feature" and export comparison into a different "isSaneAs(otherStyle)" function
-
- @param other<StyleAttribute> - The other style attribute map.
-
-- __getattribute__(self, name)
- __getattribute__ - used on dot (.) access on a Style element.
-
-@param name <str> - The style attribute name
-
- NOTE: This should be the camelCase name (like paddingTop)
-
-@return <str> - The attribute value or empty string if not set
-
-- __init__(self, styleValue, tag=None)
- __init__ - Create a StyleAttribute object.
-
-@param styleValue <str> - A style string ( like "display: none; padding-top: 5px" )
-
-- __ne__(self, other)
- Return self!=value.
-
-- __repr__(self)
- Return repr(self).
-
-- __setattr__(self, name, val)
- __setattr__ - Used to set an attribute using dot (.) access on a Style element
-
-@param name <str> - The attribute name
-
- NOTE: This must be the camelCase name (like paddingTop).
-
-@param val <str> - The value of the attribute
-
-- __str__(self)
- Return str(self).
-
-- isEmpty(self)
- isEmpty - Check if this is an "empty" style (no attributes set)
-
- @return <bool> - True if no attributes are set, otherwise False
-
-- setProperty(self, name, value)
- setProperty - Set a style property to a value.
-
- NOTE: To remove a style, use a value of empty string, or None
-
- @param name <str> - The style name.
-
- NOTE: The dash names are expected here, whereas dot-access expects the camel case names.
-
- Example: name="font-weight" versus the dot-access style.fontWeight
-
- @param value <str> - The style value, or empty string to remove property
-
-- setTag(self, tag)
- setTag - Set the tag association for this style.
-
- This will handle the underlying weakref to the tag.
-
- Call setTag(None) to clear the association, otherwise setTag(tag) to associate this style to that tag.
-
-
- @param tag <AdvancedTag/None> - The new association. If None, the association is cleared, otherwise the passed tag
- becomes associated with this style.
-
-
-Static methods defined here:
-- camelCaseToDashName(camelCase)
- camelCaseToDashName - Convert a camel case name to a dash-name (like paddingTop to padding-top)
-
-@param camelCase <str> - A camel-case string
-
-@return <str> - A dash-name
-
-- dashNameToCamelCase(dashName)
- dashNameToCamelCase - Converts a "dash name" (like padding-top) to its camel-case name ( like "paddingTop" )
-
-@param dashName <str> - A name containing dashes
-
- NOTE: This method is currently unused, but may be used in the future. kept for completeness.
-
-@return <str> - The camel-case form
-
-- styleToDict(styleStr)
- styleToDict - Gets a dictionary of style attribute/value pairs.
-
- NOTE: dash-names (like padding-top) are used here
-
-@return - OrderedDict of "style" attribute.
-
-
-Data descriptors defined here:
-- __dict__
-- dictionary for instance variables (if defined)
-
-- __weakref__
-- list of weak references to the object (if defined)
-
-- tag
-- tag - Property (dot-access variable) which will return the associated tag, if any.
-
- This method should be used for access to handle the weakref.
-
- @see setTag - Method to set or remove the tag association
-
- @return <AdvancedTag/None> - If a tag is associated with this style, it will be returned.
- Otherwise, None will be returned
-
-
-Data and other attributes defined here:
-- RESERVED_ATTRIBUTES = ('_styleValue', '_styleDict', '_asStr', '_ensureHtmlAttribute', 'tag', '_tagRef', 'setTag', 'isEmpty', 'setProperty')
-
-- __hash__ = None
-
- |
-
-
-
-class TagCollection(builtins.list) |
-
- |
-TagCollection(values=None)
-
-A collection of AdvancedTags. You may use this like a normal list, or you can use the various getElements* functions within to operate on the results.
-Generally, this is the return of all get* functions.
-
-All the get* functions called on a TagCollection search all contained elements and their children. If you need to check ONLY the elements in the tag collection, and not their children,
-either provide your own list comprehension to do so, or use the "filterCollection" method, which takes an arbitrary function/lambda expression and filters just the immediate tags. |
- |
-- Method resolution order:
-- TagCollection
-- builtins.list
-- builtins.object
-
-
-Methods defined here:
-- __add__(self, others)
- Return self+value.
-
-- __iadd__(self, others)
- Implement self+=value.
-
-- __init__(self, values=None)
- Create this object.
-
-@param values - Initial values, or None for empty
-
-- __isub__(self, others)
-
-- __repr__(self)
- Return repr(self).
-
-- __sub__(self, others)
-
-- all(self)
- all - A plain list of these elements
-
-@return - List of these elements
-
-- append(self, tag)
- append - Append an item to this tag collection
-
-@param tag - an AdvancedTag
-
-- contains(self, em)
- contains - Check if #em occurs within any of the elements within this list, as themselves or as a child, any
- number of levels down.
-
- To check if JUST an element is contained within this list directly, use the "in" operator.
-
-@param em <AdvancedTag> - Element of interest
-
-@return <bool> - True if contained, otherwise False
-
-- containsUid(self, uid)
- containsUid - Check if #uid is the uid (unique internal identifier) of any of the elements within this list,
- as themselves or as a child, any number of levels down.
-
-
-@param uid <uuid.UUID> - uuid of interest
-
-@return <bool> - True if contained, otherwise False
-
-- filter(self, **kwargs)
- filter aka filterAnd - Perform a filter operation on ALL nodes in this collection (NOT including children, see #filterAll for that)
-
-Results must match ALL the filter criteria. for ANY, use the *Or methods
-
-For special filter keys, @see #AdvancedHTMLParser.AdvancedHTMLParser.filter
-
-Requires the QueryableList module to be installed (i.e. AdvancedHTMLParser was installed
- without '--no-deps' flag.)
-
-For alternative without QueryableList,
- consider #AdvancedHTMLParser.AdvancedHTMLParser.find method or the getElement* methods
-
-
-@return TagCollection<AdvancedTag>
-
-- filterAll(self, **kwargs)
- filterAll aka filterAllAnd - Perform a filter operation on ALL nodes in this collection and all their children.
-
-Results must match ALL the filter criteria. for ANY, use the *Or methods
-
-For just the nodes in this collection, use "filter" or "filterAnd" on a TagCollection
-
-For special filter keys, @see #AdvancedHTMLParser.AdvancedHTMLParser.filter
-
-Requires the QueryableList module to be installed (i.e. AdvancedHTMLParser was installed
- without '--no-deps' flag.)
-
-For alternative without QueryableList,
- consider #AdvancedHTMLParser.AdvancedHTMLParser.find method or the getElement* methods
-
-@return TagCollection<AdvancedTag>
-
-- filterAllOr(self, **kwargs)
- filterAllOr - Perform a filter operation on ALL nodes in this collection and all their children.
-
-Results must match ANY of the filter criteria. for ALL, use the *And methods
-
-For just the nodes in this collection, use "filterOr" on a TagCollection
-
-For special filter keys, @see #AdvancedHTMLParser.AdvancedHTMLParser.filter
-
-Requires the QueryableList module to be installed (i.e. AdvancedHTMLParser was installed
- without '--no-deps' flag.)
-
-For alternative without QueryableList,
- consider #AdvancedHTMLParser.AdvancedHTMLParser.find method or the getElement* methods
-
-
-@return TagCollection<AdvancedTag>
-
-- filterAnd = filter(self, **kwargs)
-
-- filterCollection(self, filterFunc)
- filterCollection - Filters only the immediate objects contained within this Collection against a function, not including any children
-
-@param filterFunc <function> - A function or lambda expression that returns True to have that element match
-
-@return TagCollection<AdvancedTag>
-
-- filterOr(self, **kwargs)
- filterOr - Perform a filter operation on the nodes in this collection (NOT including children, see #filterAllOr for that)
-
-Results must match ANY of the filter criteria. for ALL, use the *And methods
-
-For special filter keys, @see #AdvancedHTMLParser.AdvancedHTMLParser.filter
-
-Requires the QueryableList module to be installed (i.e. AdvancedHTMLParser was installed
- without '--no-deps' flag.)
-
-For alternative without QueryableList,
- consider #AdvancedHTMLParser.AdvancedHTMLParser.find method or the getElement* methods
-
-
-@return TagCollection<AdvancedTag>
-
-- getAllNodeUids(self)
- getAllNodeUids - Gets all the internal uids of all nodes, their children, and all their children so on..
-
- @return set<uuid.UUID>
-
-- getAllNodes(self)
- getAllNodes - Gets all the nodes, and all their children for every node within this collection
-
-- getElementById(self, _id)
- getElementById - Gets an element within this collection by id
-
-@param _id - string of "id" attribute
-
-@return - a single tag matching the id, or None if none found
-
-- getElementsByAttr(self, attr, value)
- getElementsByAttr - Get elements within this collection possessing a given attribute/value pair
-
-@param attr - Attribute name (lowercase)
-@param value - Matching value
-
-@return - TagCollection of all elements matching name/value
-
-- getElementsByClassName(self, className)
- getElementsByClassName - Get elements within this collection containing a specific class name
-
-@param className <str> - One or more space-separated class names
-
-@return - TagCollection of unique elements within this collection tagged with a specific class name
-
-- getElementsByName(self, name)
- getElementsByName - Get elements within this collection having a specific name
-
-@param name - String of "name" attribute
-
-@return - TagCollection of unique elements within this collection with given "name"
-
-- getElementsByTagName(self, tagName)
- getElementsByTagName - Gets elements within this collection having a specific tag name
-
-@param tagName - String of tag name
-
-@return - TagCollection of unique elements within this collection with given tag name
-
-- getElementsCustomFilter(self, filterFunc)
- getElementsCustomFilter - Get elements within this collection that match a user-provided function.
-
-@param filterFunc <function> - A function that returns True if the element matches criteria
-
-@return - TagCollection of all elements that matched criteria
-
-- getElementsWithAttrValues(self, attr, values)
- getElementsWithAttrValues - Get elements within this collection possessing an attribute name matching one of several values
-
-@param attr <lowercase str> - Attribute name (lowercase)
-@param values set<str> - Set of possible matching values
-
-@return - TagCollection of all elements matching criteria
-
-- remove(self, toRemove)
- remove - Remove an item from this tag collection
-
-@param toRemove - an AdvancedTag
-
-
-Data descriptors defined here:
-- __dict__
-- dictionary for instance variables (if defined)
-
-- __weakref__
-- list of weak references to the object (if defined)
-
-
-Data and other attributes defined here:
-- filterAllAnd = <class 'filter'>
- filter(function or None, iterable) --> filter object
-
-Return an iterator yielding those items of iterable for which function(item)
-is true. If function is None, return the items that are true.
-
-
-Methods inherited from builtins.list:
-- __contains__(self, key, /)
- Return key in self.
-
-- __delitem__(self, key, /)
- Delete self[key].
-
-- __eq__(self, value, /)
- Return self==value.
-
-- __ge__(self, value, /)
- Return self>=value.
-
-- __getattribute__(self, name, /)
- Return getattr(self, name).
-
-- __getitem__(...)
- x.__getitem__(y) <==> x[y]
-
-- __gt__(self, value, /)
- Return self>value.
-
-- __imul__(self, value, /)
- Implement self*=value.
-
-- __iter__(self, /)
- Implement iter(self).
-
-- __le__(self, value, /)
- Return self<=value.
-
-- __len__(self, /)
- Return len(self).
-
-- __lt__(self, value, /)
- Return self<value.
-
-- __mul__(self, value, /)
- Return self*value.
-
-- __ne__(self, value, /)
- Return self!=value.
-
-- __reversed__(self, /)
- Return a reverse iterator over the list.
-
-- __rmul__(self, value, /)
- Return value*self.
-
-- __setitem__(self, key, value, /)
- Set self[key] to value.
-
-- __sizeof__(self, /)
- Return the size of the list in memory, in bytes.
-
-- clear(self, /)
- Remove all items from list.
-
-- copy(self, /)
- Return a shallow copy of the list.
-
-- count(self, value, /)
- Return number of occurrences of value.
-
-- extend(self, iterable, /)
- Extend list by appending elements from the iterable.
-
-- index(self, value, start=0, stop=9223372036854775807, /)
- Return first index of value.
-
-Raises ValueError if the value is not present.
-
-- insert(self, index, object, /)
- Insert object before index.
-
-- pop(self, index=-1, /)
- Remove and return item at index (default last).
-
-Raises IndexError if list is empty or index is out of range.
-
-- reverse(self, /)
- Reverse *IN PLACE*.
-
-- sort(self, /, *, key=None, reverse=False)
- Stable sort *IN PLACE*.
-
-
-Static methods inherited from builtins.list:
-- __new__(*args, **kwargs) from builtins.type
- Create and return a new object. See help(type) for accurate signature.
-
-
-Data and other attributes inherited from builtins.list:
-- __hash__ = None
-
- |
-
-
-
-class ValidatingAdvancedHTMLParser(AdvancedHTMLParser.Parser.AdvancedHTMLParser) |
-
- |
-ValidatingAdvancedHTMLParser(filename=None, encoding='utf-8')
-
-ValidatingAdvancedHTMLParser - A parser which will raise Exceptions for a couple HTML errors that would otherwise cause
- an assumption to be made during parsing.
-
-exceptions.InvalidCloseException - The parsed string/file tried to close something it shouldn't have.
-exceptions.MissedCloseException - The parsed string/file missed closing an item. |
- |
-- Method resolution order:
-- ValidatingAdvancedHTMLParser
-- AdvancedHTMLParser.Parser.AdvancedHTMLParser
-- html.parser.HTMLParser
-- _markupbase.ParserBase
-- builtins.object
-
-
-Methods defined here:
-- handle_endtag(self, tagName)
- Internal for parsing
-
-- handle_starttag(self, tagName, attributeList, isSelfClosing=False)
- handle_starttag - internal for parsing,
-
- ValidatingAdvancedHTMLParser will run through the attributes list and make sure
- none have an invalid name, or will raise an error.
-
-
- @raises - InvalidAttributeNameException if an attribute name is passed with invalid character(s)
-
-
-Methods inherited from AdvancedHTMLParser.Parser.AdvancedHTMLParser:
-- __contains__(self, other)
-
-- __getstate__(self)
- __getstate__ - Get state for pickling
-
- @return <dict>
-
-- __init__(self, filename=None, encoding='utf-8')
- __init__ - Creates an Advanced HTML parser object. For read-only parsing, consider IndexedAdvancedHTMLParser for faster searching.
-
- @param filename <str> - Optional filename to parse. Otherwise use parseFile or parseStr methods.
- @param encoding <str> - Specifies the document encoding. Default utf-8
-
-- __setstate__(self, state)
- __setstate__ - Restore state for loading pickle
-
- @param state <dict> - The state
-
-- asHTML = getHTML(self)
-
-- contains(self, em)
- Checks if #em is found anywhere within this element tree
-
-@param em <AdvancedTag> - Tag of interest
-
-@return <bool> - If element #em is within this tree
-
-- containsUid(self, uid)
- Check if #uid is found anywhere within this element tree
-
-@param uid <uuid.UUID> - Uid
-
-@return <bool> - If #uid is found within this tree
-
-- createElement(self, tagName)
- createElement - Create an unattached tag with the given tag name
-
-@param tagName <str> - Name of tag
-
-@return <AdvancedTag> - A tag with the given tag name
-
-- feed(self, contents)
- feed - Feed contents. Use parseStr or parseFile instead.
-
-@param contents - Contents
-
-- filter(self, **kwargs)
- filter aka filterAnd - Filter ALL the elements in this DOM.
-
-Results must match ALL the filter criteria. for ANY, use the *Or methods
-
-Requires the QueryableList module to be installed (i.e. AdvancedHTMLParser was installed
- without '--no-deps' flag.)
-
-For alternative without QueryableList,
- consider #AdvancedHTMLParser.AdvancedHTMLParser.find method or the getElement* methods
-
-Special Keys:
-
- tagname - The tag name
- text - The inner text
-
-@return TagCollection<AdvancedTag>
-
-- filterAnd = filter(self, **kwargs)
-
-- filterOr(self, **kwargs)
- filterOr - Perform a filter operation on this node and all children (and their children, onto the end)
-
-Results must match ANY of the filter criteria. for ALL, use the *AND methods
-
-For special filter keys, @see #AdvancedHTMLParser.AdvancedHTMLParser.filter
-
-Requires the QueryableList module to be installed (i.e. AdvancedHTMLParser was installed
- without '--no-deps' flag.)
-
-For alternative, consider AdvancedHTMLParser.AdvancedHTMLParser.find method or the getElement* methods
-
-@return TagCollection<AdvancedTag>
-
-- find(self, **kwargs)
- find - Perform a search of elements using attributes as keys and potential values as values
-
- (i.e. parser.find(name='blah', tagname='span') will return all elements in this document
- with the name "blah" of the tag type "span" )
-
-Arguments are key = value, or key can equal a tuple/list of values to match ANY of those values.
-
-Append a key with __contains to test if some strs (or several possible strs) are within an element
-Append a key with __icontains to perform the same __contains op, but ignoring case
-
-Special keys:
-
- tagname - The tag name of the element
- text - The text within an element
-
-NOTE: Empty string means both "not set" and "no value" in this implementation.
-
-NOTE: If you installed the QueryableList module (i.e. ran setup.py without --no-deps) it is
- better to use the "filter"/"filterAnd" or "filterOr" methods, which are also available
- on all tags and tag collections (tag collections also have filterAllAnd and filterAllOr)
-
-
-@return TagCollection<AdvancedTag> - A list of tags that matched the filter criteria
-
-- getAllNodes(self)
- getAllNodes - Get every element
-
-@return TagCollection<AdvancedTag>
-
-- getElementById(self, _id, root='root')
- getElementById - Searches and returns the first (should only be one) element with the given ID.
-
- @param _id <str> - A string of the id attribute.
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root' [default], the root of the parsed tree will be used.
-
-- getElementsByAttr(self, attrName, attrValue, root='root')
- getElementsByAttr - Searches the full tree for elements with a given attribute name and value combination. This is always a full scan.
-
- @param attrName <lowercase str> - A lowercase attribute name
- @param attrValue <str> - Expected value of attribute
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root', the root of the parsed tree will be used.
-
-- getElementsByClassName(self, className, root='root')
- getElementsByClassName - Searches and returns all elements containing a given class name.
-
- @param className <str> - One or more space-separated class names
-
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root' [default], the root of the parsed tree will be used.
-
-- getElementsByName(self, name, root='root')
- getElementsByName - Searches and returns all elements with a specific name.
-
- @param name <str> - A string of the name attribute
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root' [default], the root of the parsed tree will be used.
-
-- getElementsByTagName(self, tagName, root='root')
- getElementsByTagName - Searches and returns all elements with a specific tag name.
-
- @param tagName <lowercase str> - A lowercase string of the tag name.
- @param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root', the root of the parsed tree will be used.
-
-- getElementsCustomFilter(self, filterFunc, root='root')
- getElementsCustomFilter - Scan elements using a provided function
-
-@param filterFunc <function>(node) - A function that takes an AdvancedTag as an argument, and returns True if some arbitrary criteria is met
-
-@return - TagCollection of all matching elements
-
-- getElementsWithAttrValues(self, attrName, attrValues, root='root')
- getElementsWithAttrValues - Returns elements whose attribute, named by #attrName, has one of the values in the set #attrValues
-
-@param attrName <lowercase str> - A lowercase attribute name
-@param attrValues set<str> - A set of all valid values.
-
-
-@return - TagCollection of all matching elements
-
-- getFirstElementCustomFilter(self, filterFunc, root='root')
- getFirstElementCustomFilter - Scan elements using a provided function, stop and return the first match.
-
- @see getElementsCustomFilter to match multiple elements
-
-@param filterFunc <function>(node) - A function that takes an AdvancedTag as an argument, and returns True if some arbitrary criteria is met
-
-@return - An AdvancedTag of the node that matched, or None if no match.
-
-- getFormattedHTML(self, indent=' ')
- getFormattedHTML - Get formatted and xhtml of this document, replacing the original whitespace
- with a pretty-printed version
-
-@param indent - space/tab/newline of each level of indent, or integer for how many spaces per level
-
-@return - <str> Formatted html
-
-@see getHTML - Get HTML with original whitespace
-
-@see getMiniHTML - Get HTML with only functional whitespace remaining
-
-- getHTML(self)
- getHTML - Get the full HTML as contained within this tree.
-
- If parsed from a document, this will contain the original whitespacing.
-
- @returns - <str> of html
-
- @see getFormattedHTML
-
- @see getMiniHTML
-
-- getMiniHTML(self)
- getMiniHTML - Gets the HTML representation of this document without any pretty formatting
- and disregarding original whitespace beyond the functional.
-
- @return <str> - HTML with only functional whitespace present
-
-- getRoot(self)
- getRoot - returns the root Tag.
-
- NOTE: if there are multiple roots, this will be a special tag.
- You may want to consider using getRootNodes instead if this
- is a possible situation for you.
-
-@return AdvancedTag
-
-- getRootNodes(self)
- getRootNodes - Gets all objects at the "root" (first level; no parent). Use this if you may have multiple roots (not children of <html>)
- Use this method to get objects, for example, in an AJAX request where <html> may not be your root.
-
- Note: If there are multiple root nodes (i.e. no <html> at the top), getRoot will return a special tag. This function automatically
- handles that, and returns all root nodes.
-
- @return list<AdvancedTag> - A list of AdvancedTags which are at the root level of the tree.
-
-- handle_charref(self, charRef)
- Internal for parsing
-
-- handle_comment(self, comment)
- Internal for parsing
-
-- handle_data(self, data)
- Internal for parsing
-
-- handle_decl(self, decl)
- Internal for parsing
-
-- handle_entityref(self, entity)
- Internal for parsing
-
-- handle_startendtag(self, tagName, attributeList)
- Internal for parsing
-
-- parseFile(self, filename)
- parseFile - Parses a file and creates the DOM tree and indexes
-
- @param filename <str/file> - A string to a filename or a file object. If file object, it will not be closed, you must close.
-
-- parseStr(self, html)
- parseStr - Parses a string and creates the DOM tree and indexes.
-
- @param html <str> - valid HTML
-
-- setDoctype(self, newDoctype)
- setDoctype - Set the doctype for this document, or clear it.
-
- @param newDoctype <str/None> -
-
- If None, will clear the doctype and not return one with #getHTML
-
- Otherwise, a string of the full doctype tag.
-
- For example, the HTML5 doctype would be "DOCTYPE html"
-
-- setRoot(self, root)
- Sets the root node, and reprocesses the indexes
-
-- toHTML = getHTML(self)
-
-- unknown_decl(self, decl)
- Internal for parsing
-
-
-Class methods inherited from AdvancedHTMLParser.Parser.AdvancedHTMLParser:
-- createBlocksFromHTML(html, encoding='utf-8') from builtins.type
- createBlocksFromHTML - Returns the root level node (unless multiple nodes), and
- a list of "blocks" added (text and nodes).
-
-@return list< str/AdvancedTag > - List of blocks created. May be strings (text nodes) or AdvancedTag (tags)
-
-NOTE:
- Results may be checked by:
-
- issubclass(block.__class__, AdvancedTag)
-
- If True, block is a tag, otherwise, it is a text node
-
-- createElementFromHTML(html, encoding='utf-8') from builtins.type
- createElementFromHTML - Creates an element from a string of HTML.
-
- If this could create multiple root-level elements (children are okay),
- you must use #createElementsFromHTML which returns a list of elements created.
-
-@param html <str> - Some html data
-
-@param encoding <str> - Encoding to use for document
-
-@raises MultipleRootNodeException - If given html would produce multiple root-level elements (use #createElementsFromHTML instead)
-
-@return AdvancedTag - A single AdvancedTag
-
-NOTE: If there is text outside the tag, it will be lost.
- Use createBlocksFromHTML instead if you need to retain both text and tags.
-
- Also, if you are just appending to an existing tag, use AdvancedTag.appendInnerHTML
-
-- createElementsFromHTML(html, encoding='utf-8') from builtins.type
- createElementsFromHTML - Creates elements from provided html, and returns a list of the root-level elements
- children of these root-level nodes are accessible via the usual means.
-
-@param html <str> - Some html data
-
-@param encoding <str> - Encoding to use for document
-
-@return list<AdvancedTag> - The root (top-level) tags from parsed html.
-
-NOTE: If there is text outside the tags, it will be lost.
- Use createBlocksFromHTML instead if you need to retain both text and tags.
-
- Also, if you are just appending to an existing tag, use AdvancedTag.appendInnerHTML
-
-
-Data descriptors inherited from AdvancedHTMLParser.Parser.AdvancedHTMLParser:
-- body
-- body - Get the body element
-
-@return <AdvancedTag> - The body tag, or None if no body tag present
-
-- forms
-- forms - Return all forms associated with this document
-
-@return <TagCollection> - All "form" elements
-
-- head
-- head - Get the head element
-
-@return <AdvancedTag> - The head tag, or None if no head tag present
-
-
-Methods inherited from html.parser.HTMLParser:
-- check_for_whole_start_tag(self, i)
- # Internal -- check to see if we have a complete starttag; return end
-# or -1 if incomplete.
-
-- clear_cdata_mode(self)
-
-- close(self)
- Handle any buffered data.
-
-- get_starttag_text(self)
- Return full source of start tag: '<...>'.
-
-- goahead(self, end)
- # Internal -- handle data as far as reasonable. May leave state
-# and data to be processed by a subsequent call. If 'end' is
-# true, force handling all data as if followed by EOF marker.
-
-- handle_pi(self, data)
- # Overridable -- handle processing instruction
-
-- parse_bogus_comment(self, i, report=1)
- # Internal -- parse bogus comment, return length or -1 if not terminated
-# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
-
-- parse_endtag(self, i)
- # Internal -- parse endtag, return end or -1 if incomplete
-
-- parse_html_declaration(self, i)
- # Internal -- parse html declarations, return length or -1 if not terminated
-# See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
-# See also parse_declaration in _markupbase
-
-- parse_pi(self, i)
- # Internal -- parse processing instr, return end or -1 if not terminated
-
-- parse_starttag(self, i)
- # Internal -- handle starttag, return end or -1 if not terminated
-
-- reset(self)
- Reset this instance. Loses all unprocessed data.
-
-- set_cdata_mode(self, elem)
-
-- unescape(self, s)
- # Internal -- helper to remove special character quoting
-
-
-Data and other attributes inherited from html.parser.HTMLParser:
-- CDATA_CONTENT_ELEMENTS = ('script', 'style')
-
-
-Methods inherited from _markupbase.ParserBase:
-- error(self, message)
-
-- getpos(self)
- Return current line number and offset.
-
-- parse_comment(self, i, report=1)
- # Internal -- parse comment, return length or -1 if not terminated
-
-- parse_declaration(self, i)
- # Internal -- parse declaration (for use by subclasses).
-
-- parse_marked_section(self, i, report=1)
- # Internal -- parse a marked section
-# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
-
-- updatepos(self, i, j)
- # Internal -- update line number and offset. This should be
-# called for each piece of data exactly once, in order -- in other
-# words the concatenation of all the input strings to this
-# function should be exactly the entire input.
-
-
-Data descriptors inherited from _markupbase.ParserBase:
-- __dict__
-- dictionary for instance variables (if defined)
-
-- __weakref__
-- list of weak references to the object (if defined)
-
- | |