You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
211 lines
7.0 KiB
211 lines
7.0 KiB
2 years ago
|
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
|
||
|
# Licensed to PSF under a Contributor Agreement.
|
||
|
|
||
|
# Modifications:
|
||
|
# Copyright David Halter and Contributors
|
||
|
# Modifications are dual-licensed: MIT and PSF.
|
||
|
# 99% of the code is different from pgen2, now.
|
||
|
|
||
|
"""
|
||
|
The ``Parser`` tries to convert the available Python code in an easy to read
|
||
|
format, something like an abstract syntax tree. The classes who represent this
|
||
|
tree, are sitting in the :mod:`parso.tree` module.
|
||
|
|
||
|
The Python module ``tokenize`` is a very important part in the ``Parser``,
|
||
|
because it splits the code into different words (tokens). Sometimes it looks a
|
||
|
bit messy. Sorry for that! You might ask now: "Why didn't you use the ``ast``
|
||
|
module for this? Well, ``ast`` does a very good job understanding proper Python
|
||
|
code, but fails to work as soon as there's a single line of broken code.
|
||
|
|
||
|
There's one important optimization that needs to be known: Statements are not
|
||
|
being parsed completely. ``Statement`` is just a representation of the tokens
|
||
|
within the statement. This lowers memory usage and cpu time and reduces the
|
||
|
complexity of the ``Parser`` (there's another parser sitting inside
|
||
|
``Statement``, which produces ``Array`` and ``Call``).
|
||
|
"""
|
||
|
from typing import Dict, Type
|
||
|
|
||
|
from parso import tree
|
||
|
from parso.pgen2.generator import ReservedString
|
||
|
|
||
|
|
||
|
class ParserSyntaxError(Exception):
|
||
|
"""
|
||
|
Contains error information about the parser tree.
|
||
|
|
||
|
May be raised as an exception.
|
||
|
"""
|
||
|
def __init__(self, message, error_leaf):
|
||
|
self.message = message
|
||
|
self.error_leaf = error_leaf
|
||
|
|
||
|
|
||
|
class InternalParseError(Exception):
|
||
|
"""
|
||
|
Exception to signal the parser is stuck and error recovery didn't help.
|
||
|
Basically this shouldn't happen. It's a sign that something is really
|
||
|
wrong.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, msg, type_, value, start_pos):
|
||
|
Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
|
||
|
(msg, type_.name, value, start_pos))
|
||
|
self.msg = msg
|
||
|
self.type = type
|
||
|
self.value = value
|
||
|
self.start_pos = start_pos
|
||
|
|
||
|
|
||
|
class Stack(list):
|
||
|
def _allowed_transition_names_and_token_types(self):
|
||
|
def iterate():
|
||
|
# An API just for Jedi.
|
||
|
for stack_node in reversed(self):
|
||
|
for transition in stack_node.dfa.transitions:
|
||
|
if isinstance(transition, ReservedString):
|
||
|
yield transition.value
|
||
|
else:
|
||
|
yield transition # A token type
|
||
|
|
||
|
if not stack_node.dfa.is_final:
|
||
|
break
|
||
|
|
||
|
return list(iterate())
|
||
|
|
||
|
|
||
|
class StackNode:
|
||
|
def __init__(self, dfa):
|
||
|
self.dfa = dfa
|
||
|
self.nodes = []
|
||
|
|
||
|
@property
|
||
|
def nonterminal(self):
|
||
|
return self.dfa.from_rule
|
||
|
|
||
|
def __repr__(self):
|
||
|
return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)
|
||
|
|
||
|
|
||
|
def _token_to_transition(grammar, type_, value):
|
||
|
# Map from token to label
|
||
|
if type_.value.contains_syntax:
|
||
|
# Check for reserved words (keywords)
|
||
|
try:
|
||
|
return grammar.reserved_syntax_strings[value]
|
||
|
except KeyError:
|
||
|
pass
|
||
|
|
||
|
return type_
|
||
|
|
||
|
|
||
|
class BaseParser:
|
||
|
"""Parser engine.
|
||
|
|
||
|
A Parser instance contains state pertaining to the current token
|
||
|
sequence, and should not be used concurrently by different threads
|
||
|
to parse separate token sequences.
|
||
|
|
||
|
See python/tokenize.py for how to get input tokens by a string.
|
||
|
|
||
|
When a syntax error occurs, error_recovery() is called.
|
||
|
"""
|
||
|
|
||
|
node_map: Dict[str, Type[tree.BaseNode]] = {}
|
||
|
default_node = tree.Node
|
||
|
|
||
|
leaf_map: Dict[str, Type[tree.Leaf]] = {}
|
||
|
default_leaf = tree.Leaf
|
||
|
|
||
|
def __init__(self, pgen_grammar, start_nonterminal='file_input', error_recovery=False):
|
||
|
self._pgen_grammar = pgen_grammar
|
||
|
self._start_nonterminal = start_nonterminal
|
||
|
self._error_recovery = error_recovery
|
||
|
|
||
|
def parse(self, tokens):
|
||
|
first_dfa = self._pgen_grammar.nonterminal_to_dfas[self._start_nonterminal][0]
|
||
|
self.stack = Stack([StackNode(first_dfa)])
|
||
|
|
||
|
for token in tokens:
|
||
|
self._add_token(token)
|
||
|
|
||
|
while True:
|
||
|
tos = self.stack[-1]
|
||
|
if not tos.dfa.is_final:
|
||
|
# We never broke out -- EOF is too soon -- Unfinished statement.
|
||
|
# However, the error recovery might have added the token again, if
|
||
|
# the stack is empty, we're fine.
|
||
|
raise InternalParseError(
|
||
|
"incomplete input", token.type, token.string, token.start_pos
|
||
|
)
|
||
|
|
||
|
if len(self.stack) > 1:
|
||
|
self._pop()
|
||
|
else:
|
||
|
return self.convert_node(tos.nonterminal, tos.nodes)
|
||
|
|
||
|
def error_recovery(self, token):
|
||
|
if self._error_recovery:
|
||
|
raise NotImplementedError("Error Recovery is not implemented")
|
||
|
else:
|
||
|
type_, value, start_pos, prefix = token
|
||
|
error_leaf = tree.ErrorLeaf(type_, value, start_pos, prefix)
|
||
|
raise ParserSyntaxError('SyntaxError: invalid syntax', error_leaf)
|
||
|
|
||
|
def convert_node(self, nonterminal, children):
|
||
|
try:
|
||
|
node = self.node_map[nonterminal](children)
|
||
|
except KeyError:
|
||
|
node = self.default_node(nonterminal, children)
|
||
|
return node
|
||
|
|
||
|
def convert_leaf(self, type_, value, prefix, start_pos):
|
||
|
try:
|
||
|
return self.leaf_map[type_](value, start_pos, prefix)
|
||
|
except KeyError:
|
||
|
return self.default_leaf(value, start_pos, prefix)
|
||
|
|
||
|
def _add_token(self, token):
|
||
|
"""
|
||
|
This is the only core function for parsing. Here happens basically
|
||
|
everything. Everything is well prepared by the parser generator and we
|
||
|
only apply the necessary steps here.
|
||
|
"""
|
||
|
grammar = self._pgen_grammar
|
||
|
stack = self.stack
|
||
|
type_, value, start_pos, prefix = token
|
||
|
transition = _token_to_transition(grammar, type_, value)
|
||
|
|
||
|
while True:
|
||
|
try:
|
||
|
plan = stack[-1].dfa.transitions[transition]
|
||
|
break
|
||
|
except KeyError:
|
||
|
if stack[-1].dfa.is_final:
|
||
|
self._pop()
|
||
|
else:
|
||
|
self.error_recovery(token)
|
||
|
return
|
||
|
except IndexError:
|
||
|
raise InternalParseError("too much input", type_, value, start_pos)
|
||
|
|
||
|
stack[-1].dfa = plan.next_dfa
|
||
|
|
||
|
for push in plan.dfa_pushes:
|
||
|
stack.append(StackNode(push))
|
||
|
|
||
|
leaf = self.convert_leaf(type_, value, prefix, start_pos)
|
||
|
stack[-1].nodes.append(leaf)
|
||
|
|
||
|
def _pop(self):
|
||
|
tos = self.stack.pop()
|
||
|
# If there's exactly one child, return that child instead of
|
||
|
# creating a new node. We still create expr_stmt and
|
||
|
# file_input though, because a lot of Jedi depends on its
|
||
|
# logic.
|
||
|
if len(tos.nodes) == 1:
|
||
|
new_node = tos.nodes[0]
|
||
|
else:
|
||
|
new_node = self.convert_node(tos.dfa.from_rule, tos.nodes)
|
||
|
|
||
|
self.stack[-1].nodes.append(new_node)
|