class PageElement(object):
"""Contains the navigational information for some part of the page:
that is, its current location in the parse tree.
NavigableString, Tag, etc. are all subclasses of PageElement.
"""
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and
other elements.
:param parent: The parent of this element.
:param previous_element: The element parsed immediately before
this one.
:param next_element: The element parsed immediately before
this one.
:param previous_sibling: The most recently encountered element
on the same level of the parse tree as this one.
:param previous_sibling: The next element to be encountered
on the same level of the parse tree as this one.
"""
self.parent = parent
self.previous_element = previous_element
if previous_element is not None:
self.previous_element.next_element = self
self.next_element = next_element
if self.next_element is not None:
self.next_element.previous_element = self
self.next_sibling = next_sibling
if self.next_sibling is not None:
self.next_sibling.previous_sibling = self
if (previous_sibling is None
and self.parent is not None and self.parent.contents):
previous_sibling = self.parent.contents[-1]
self.previous_sibling = previous_sibling
if previous_sibling is not None:
self.previous_sibling.next_sibling = self
class NavigableString(str, PageElement):
"""A Python Unicode string that is part of a parse tree.
When Beautiful Soup parses the markup <b>penguin</b>, it will
create a NavigableString for the string "penguin".
"""
PREFIX = ''
SUFFIX = ''
# We can't tell just by looking at a string whether it's contained
# in an XML document or an HTML document.
known_xml = None
def __new__(cls, value):
"""Create a new NavigableString.
When unpickling a NavigableString, this method is called with
the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
passed in to the superclass's __new__ or the superclass won't know
how to handle non-ASCII characters.
"""
if isinstance(value, str):
u = str.__new__(cls, value)
else:
u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
u.setup()
return u
def __copy__(self):
"""A copy of a NavigableString has the same contents and class
as the original, but it is not connected to the parse tree.
"""
return type(self)(self)
def __getnewargs__(self):
return (str(self),)
def __getattr__(self, attr):
"""text.string gives you text. This is for backwards
compatibility for Navigable*String, but for CData* it lets you
get the string without the CData wrapper."""
if attr == 'string':
return self
else:
raise AttributeError(
"'%s' object has no attribute '%s'" % (
self.__class__.__name__, attr))
def output_ready(self, formatter="minimal"):
"""Run the string through the provided formatter.
:param formatter: A Formatter object, or a string naming one of the standard formatters.
"""
output = self.format_string(self, formatter)
return self.PREFIX + output + self.SUFFIX
@property
def name(self):
"""Since a NavigableString is not a Tag, it has no .name.
This property is implemented so that code like this doesn't crash
when run on a mixture of Tag and NavigableString objects:
[x.name for x in tag.children]
"""
return None
@name.setter
def name(self, name):
"""Prevent NavigableString.name from ever being set."""
raise AttributeError("A NavigableString cannot be given a name.")