388 lines
15 KiB
388 lines
15 KiB
"""
|
|
This module contains the core classes of version 2.0 of SAX for Python.
|
|
This file provides only default classes with absolutely minimum
|
|
functionality, from which drivers and applications can be subclassed.
|
|
|
|
Many of these classes are empty and are included only as documentation
|
|
of the interfaces.
|
|
|
|
$Id$
|
|
"""
|
|
|
|
# Version of the SAX for Python API that this module describes.
version = "2.0beta"
|
|
|
|
#============================================================================
|
|
#
|
|
# HANDLER INTERFACES
|
|
#
|
|
#============================================================================
|
|
|
|
# ===== ERRORHANDLER =====
|
|
|
|
class ErrorHandler:
    """Default interface for SAX error handlers.

    Register an object implementing this interface with your XMLReader
    and the parser will call its methods to report every warning and
    error.  Errors come in three levels: warnings, (possibly)
    recoverable errors, and unrecoverable errors.  Each method receives
    a SAXParseException as its only parameter.
    """

    def error(self, exception):
        """Handle a recoverable error; this default raises *exception*."""
        raise exception

    def fatalError(self, exception):
        """Handle a non-recoverable error; this default raises *exception*."""
        raise exception

    def warning(self, exception):
        """Handle a warning; this default prints *exception* and returns."""
        print(exception)
|
|
|
|
|
|
# ===== CONTENTHANDLER =====
|
|
|
|
class ContentHandler:
    """Callback interface through which logical document content is reported.

    Of all the SAX interfaces this is the main callback interface and
    the one that matters most to applications.  Its events arrive in
    the same order as the corresponding information appears in the
    document.
    """

    def __init__(self):
        # No locator is known until one is handed over through
        # setDocumentLocator().
        self._locator = None

    def setDocumentLocator(self, locator):
        """Store the locator the parser offers for locating document events.

        SAX parsers are strongly encouraged, though not absolutely
        required, to supply a locator.  A parser that does so must call
        this method before invoking any other method of this interface.

        Through the locator the application can determine the end
        position of any document-related event, even when the parser is
        not reporting an error.  The typical use is reporting the
        application's own errors, for example character content that
        does not match its business rules.  The information returned is
        probably not sufficient for use with a search engine.

        The locator returns correct information only while the events
        of this interface are being invoked; the application should not
        attempt to use it at any other time.
        """
        self._locator = locator

    def startDocument(self):
        """Be notified that a document begins.

        The SAX parser invokes this method only once, ahead of every
        other method in this interface or in DTDHandler, with
        setDocumentLocator as the sole exception.
        """

    def endDocument(self):
        """Be notified that a document has ended.

        The SAX parser invokes this method only once, and it is the
        last method invoked during the parse.  The parser shall not
        invoke it until it has either abandoned parsing (because of an
        unrecoverable error) or reached the end of input.
        """

    def startPrefixMapping(self, prefix, uri):
        """Open the scope of a prefix-URI Namespace mapping.

        Normal Namespace processing does not need the information from
        this event: when the http://xml.org/sax/features/namespaces
        feature is true (the default), the SAX XML reader replaces
        prefixes for element and attribute names automatically.

        Some applications, however, use prefixes in character data or
        in attribute values, where they cannot safely be expanded
        automatically.  The start/endPrefixMapping events give such an
        application what it needs to expand prefixes in those contexts
        itself, if necessary.

        start/endPrefixMapping events are not guaranteed to be properly
        nested relative to each other.  Every startPrefixMapping event
        occurs before the corresponding startElement event and every
        endPrefixMapping event occurs after the corresponding
        endElement event, but their order is not guaranteed.
        """

    def endPrefixMapping(self, prefix):
        """Close the scope of a prefix-URI mapping.

        Details are given under startPrefixMapping.  This event always
        occurs after the corresponding endElement event; beyond that,
        the order of endPrefixMapping events is not guaranteed.
        """

    def startElement(self, name, attrs):
        """Signal that an element starts, in non-namespace mode.

        name is the raw XML 1.0 name of the element type as a string.
        attrs is an instance of the Attributes class holding the
        attributes of the element.
        """

    def endElement(self, name):
        """Signal that an element ends, in non-namespace mode.

        name is the name of the element type, exactly as in the
        startElement event.
        """

    def startElementNS(self, name, qname, attrs):
        """Signal that an element starts, in namespace mode.

        name is the name of the element type as a (uri, localname)
        tuple, qname is the raw XML 1.0 name used in the source
        document, and attrs is an instance of the Attributes class
        holding the attributes of the element.

        For elements which have no namespace, the uri part of the name
        tuple is None.
        """

    def endElementNS(self, name, qname):
        """Signal that an element ends, in namespace mode.

        name is the name of the element type, exactly as in the
        startElementNS event.
        """

    def characters(self, content):
        """Be notified of character data.

        The Parser calls this method to report each chunk of character
        data.  A SAX parser may deliver all contiguous character data
        in a single chunk or split it into several chunks.  Either way,
        all of the characters in any single event must come from the
        same external entity so that the Locator provides useful
        information.
        """

    def ignorableWhitespace(self, whitespace):
        """Be notified of ignorable whitespace in element content.

        Validating Parsers must use this method to report each chunk of
        ignorable whitespace (see the W3C XML 1.0 recommendation,
        section 2.10).  Non-validating parsers may also use it if they
        are capable of parsing and using content models.

        A SAX parser may deliver all contiguous whitespace in a single
        chunk or split it into several chunks.  Either way, all of the
        characters in any single event must come from the same external
        entity, so that the Locator provides useful information.
        """

    def processingInstruction(self, target, data):
        """Be notified of a processing instruction.

        The Parser invokes this method once for each processing
        instruction found.  Processing instructions may occur before or
        after the main document element.

        A SAX parser should never use this method to report an XML
        declaration (XML 1.0, section 2.8) or a text declaration
        (XML 1.0, section 4.3.1).
        """

    def skippedEntity(self, name):
        """Be notified of a skipped entity.

        The Parser invokes this method once for each entity skipped.
        Non-validating processors may skip entities whose declarations
        they have not seen (because, for example, the entity was
        declared in an external DTD subset).  All processors may skip
        external entities, depending on the values of the
        http://xml.org/sax/features/external-general-entities and the
        http://xml.org/sax/features/external-parameter-entities
        properties.
        """
|
|
|
|
|
|
# ===== DTDHANDLER =====
|
|
|
|
class DTDHandler:
    """Receiver for DTD events.

    Only the DTD events required for basic parsing (unparsed entities
    and attributes) are specified by this interface.
    """

    def notationDecl(self, name, publicId, systemId):
        """Handle a notation declaration event."""

    def unparsedEntityDecl(self, name, publicId, systemId, ndata):
        """Handle an unparsed entity declaration event."""
|
|
|
|
|
|
# ===== ENTITYRESOLVER =====
|
|
|
|
class EntityResolver:
    """Default interface for resolving entities.

    Register an object implementing this interface with your Parser
    and the parser will call its method to resolve all external
    entities.  Note that DefaultHandler implements this interface with
    the default behaviour.
    """

    def resolveEntity(self, publicId, systemId):
        """Resolve the system identifier of an entity.

        The result is either the system identifier to read from, as a
        string, or an InputSource to read from.  This default hands
        back systemId unchanged.
        """
        return systemId
|
|
|
|
|
|
#============================================================================
|
|
#
|
|
# CORE FEATURES
|
|
#
|
|
#============================================================================
|
|
|
|
# Namespace processing.
#   true:   Perform Namespace processing (default).
#   false:  Optionally do not perform Namespace processing
#           (implies namespace-prefixes).
#   access: (parsing) read-only; (not parsing) read/write
feature_namespaces = "http://xml.org/sax/features/namespaces"

# Reporting of prefixed names and Namespace declaration attributes.
#   true:   Report the original prefixed names and attributes used for
#           Namespace declarations.
#   false:  Do not report attributes used for Namespace declarations, and
#           optionally do not report original prefixed names (default).
#   access: (parsing) read-only; (not parsing) read/write
feature_namespace_prefixes = "http://xml.org/sax/features/namespace-prefixes"

# Interning of names.
#   true:   All element names, prefixes, attribute names, Namespace URIs,
#           and local names are interned using the built-in intern function.
#   false:  Names are not necessarily interned, although they may be
#           (default).
#   access: (parsing) read-only; (not parsing) read/write
feature_string_interning = "http://xml.org/sax/features/string-interning"

# Validation.
#   true:   Report all validation errors (implies external-general-entities
#           and external-parameter-entities).
#   false:  Do not report validation errors.
#   access: (parsing) read-only; (not parsing) read/write
feature_validation = "http://xml.org/sax/features/validation"

# External general (text) entities.
#   true:   Include all external general (text) entities.
#   false:  Do not include external general entities.
#   access: (parsing) read-only; (not parsing) read/write
feature_external_ges = "http://xml.org/sax/features/external-general-entities"

# External parameter entities.
#   true:   Include all external parameter entities, including the external
#           DTD subset.
#   false:  Do not include any external parameter entities, even the
#           external DTD subset.
#   access: (parsing) read-only; (not parsing) read/write
feature_external_pes = "http://xml.org/sax/features/external-parameter-entities"

# Every core feature identifier defined above.
all_features = [
    feature_namespaces,
    feature_namespace_prefixes,
    feature_string_interning,
    feature_validation,
    feature_external_ges,
    feature_external_pes,
]
|
|
|
|
|
|
#============================================================================
|
|
#
|
|
# CORE PROPERTIES
|
|
#
|
|
#============================================================================
|
|
|
|
# Lexical handler.
#   data type:   xml.sax.sax2lib.LexicalHandler
#   description: An optional extension handler for lexical events like
#                comments.
#   access:      read/write
property_lexical_handler = "http://xml.org/sax/properties/lexical-handler"

# Declaration handler.
#   data type:   xml.sax.sax2lib.DeclHandler
#   description: An optional extension handler for DTD-related events other
#                than notations and unparsed entities.
#   access:      read/write
property_declaration_handler = "http://xml.org/sax/properties/declaration-handler"

# DOM node.
#   data type:   org.w3c.dom.Node
#   description: When parsing, the current DOM node being visited if this
#                is a DOM iterator; when not parsing, the root DOM node for
#                iteration.
#   access:      (parsing) read-only; (not parsing) read/write
property_dom_node = "http://xml.org/sax/properties/dom-node"

# XML string.
#   data type:   String
#   description: The literal string of characters that was the source for
#                the current event.
#   access:      read-only
property_xml_string = "http://xml.org/sax/properties/xml-string"

# Encoding.
#   data type:     String
#   description:   The name of the encoding to assume for input data.
#   access:        write: set the encoding, e.g. established by a
#                         higher-level protocol.  May change during parsing
#                         (e.g. after processing a META tag).
#                  read:  return the current encoding (possibly established
#                         through auto-detection).
#   initial value: UTF-8
property_encoding = "http://www.python.org/sax/properties/encoding"

# Interning dictionary.
#   data type:   Dictionary
#   description: The dictionary used to intern common strings in the
#                document.
#   access:      write: Request that the parser uses a specific dictionary,
#                       to allow interning across different documents.
#                read:  return the current interning dictionary, or None.
property_interning_dict = "http://www.python.org/sax/properties/interning-dict"

# Every core property identifier defined above.
all_properties = [
    property_lexical_handler,
    property_dom_node,
    property_declaration_handler,
    property_xml_string,
    property_encoding,
    property_interning_dict,
]
|
|
|
|
|
|
class LexicalHandler:
    """Optional SAX2 handler for lexical events.

    Lexical information describes how an XML document was encoded, as
    opposed to what it contains (which is reported to the
    ContentHandler).  Comments and the boundaries of CDATA marked
    sections are examples of what this handler is used to obtain.

    The LexicalHandler of an XMLReader is set with the setProperty
    method, using the property identifier
    'http://xml.org/sax/properties/lexical-handler'.
    """

    def comment(self, content):
        """Report a comment found anywhere in the document.

        That includes comments in the DTD and outside the document
        element.

        content is a string that holds the contents of the comment.
        """

    def startDTD(self, name, public_id, system_id):
        """Report the start of the DTD declarations, if the document
        has an associated DTD.

        Before declaration events from the external DTD subset are
        reported, a startEntity event will be reported; this can be
        used to infer from which subset DTD declarations derive.

        name is the name of the document element type, public_id is the
        public identifier of the DTD (or None if none was supplied) and
        system_id is the system identifier of the external subset (or
        None if none was supplied).
        """

    def endDTD(self):
        """Signal the end of the DTD declarations."""

    def startCDATA(self):
        """Report the beginning of a CDATA marked section.

        The contents of the CDATA marked section are reported through
        the characters event.
        """

    def endCDATA(self):
        """Report the end of a CDATA marked section."""
|