186 lines
8.1 KiB
Python
186 lines
8.1 KiB
Python
# This file is part of Patsy
|
|
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
|
|
# See file LICENSE.txt for license information.
|
|
|
|
# Utilities for dealing with Python code at the token level.
|
|
#
|
|
# Includes:
|
|
# a "pretty printer" that converts a sequence of tokens back into a
|
|
# readable, white-space normalized string.
|
|
# a utility function to replace calls to global functions with calls to
|
|
# other functions
|
|
|
|
import tokenize
|
|
from six.moves import cStringIO as StringIO
|
|
|
|
from patsy import PatsyError
|
|
from patsy.origin import Origin
|
|
|
|
__all__ = ["python_tokenize", "pretty_untokenize",
|
|
"normalize_token_spacing"]
|
|
|
|
# A convenience wrapper around tokenize.generate_tokens. Yields tuples
|
|
# (tokenize type, token string, origin object)
|
|
def python_tokenize(code):
    """Tokenize a Python expression, attaching an Origin to every token.

    Newlines in `code` are replaced by spaces and the result is stripped
    before it is handed to tokenize.generate_tokens. This is a generator
    yielding (tokenize type, token string, origin object) tuples. NL and
    NEWLINE tokens are skipped and iteration stops at ENDMARKER.

    Raises PatsyError if the input contains a comment, an error token, or an
    unterminated string literal. Input that ends inside a multi-line
    construct (e.g. an unclosed parenthesis) simply ends the stream early.
    """
    # Formulas can only contain Python expressions, and expressions cannot
    # meaningfully contain newlines, so flatten everything onto one line up
    # front to avoid any complications.
    source = code.replace("\n", " ").strip()
    token_stream = tokenize.generate_tokens(StringIO(source).readline)
    try:
        # The last field of each token tuple is the line the tokenizer read
        # it from. It deliberately rebinds `source`, so origins (including
        # the one built in the error handler below) refer to that line.
        for kind, text, (_, col_start), (_, col_end), source in token_stream:
            if kind == tokenize.ENDMARKER:
                break
            if kind == tokenize.NL or kind == tokenize.NEWLINE:
                assert text == ""
                continue
            origin = Origin(source, col_start, col_end)
            if kind == tokenize.ERRORTOKEN:
                raise PatsyError(
                    "error tokenizing input (maybe an unclosed string?)",
                    origin,
                )
            elif kind == tokenize.COMMENT:
                raise PatsyError("comments are not allowed", origin)
            yield (kind, text, origin)
        else:  # pragma: no cover
            raise ValueError("stream ended without ENDMARKER?!?")
    except tokenize.TokenError as e:
        # TokenError is raised when the tokenizer believes a multi-line
        # construct is in progress (e.g. an unclosed parenthesis, which lets
        # a virtual line continue past the end of the physical line) and it
        # then hits the end of the source text. We have our own error
        # handling for such cases, so treat this as an end-of-stream --
        # except for unterminated strings, which are reported to the user.
        message = e.args[0]
        if "unterminated string literal" in message:
            raise PatsyError(
                "error tokenizing input ({})".format(message),
                Origin(source, 0, len(source)),
            )

        # Just in case someone adds some other error case:
        assert "EOF in multi-line" in message
        return
|
|
|
|
def test_python_tokenize():
    """Check python_tokenize on complete, unclosed and invalid input."""
    # A complete expression: each token carries its column span.
    code = "a + (foo * -1)"
    spans = [
        (tokenize.NAME, "a", 0, 1),
        (tokenize.OP, "+", 2, 3),
        (tokenize.OP, "(", 4, 5),
        (tokenize.NAME, "foo", 5, 8),
        (tokenize.OP, "*", 9, 10),
        (tokenize.OP, "-", 11, 12),
        (tokenize.NUMBER, "1", 12, 13),
        (tokenize.OP, ")", 13, 14),
    ]
    tokens = list(python_tokenize(code))
    expected = [
        (kind, text, Origin(code, start, end))
        for (kind, text, start, end) in spans
    ]
    assert tokens == expected

    # An unclosed parenthesis ends the stream quietly instead of raising.
    code2 = "a + (b"
    spans2 = [
        (tokenize.NAME, "a", 0, 1),
        (tokenize.OP, "+", 2, 3),
        (tokenize.OP, "(", 4, 5),
        (tokenize.NAME, "b", 5, 6),
    ]
    tokens2 = list(python_tokenize(code2))
    expected2 = [
        (kind, text, Origin(code2, start, end))
        for (kind, text, start, end) in spans2
    ]
    assert tokens2 == expected2

    import pytest

    # Comments are rejected. python_tokenize is a generator, so the error
    # only surfaces once list() consumes it.
    pytest.raises(PatsyError, list, python_tokenize("a b # c"))

    # So are unterminated string literals.
    pytest.raises(PatsyError, list, python_tokenize("a b \"c"))
|
|
|
|
# Operators that are written with a space on both sides, e.g. "a + b".
_python_space_both = list("+-*/%&^|<>") + [
    "==", "<>", "!=", "<=", ">=",
    "<<", ">>", "**", "//",
]
# Tokens that get a space in front of them.
_python_space_before = _python_space_both + ["!", "~"]
# Tokens that get a space after them.
_python_space_after = _python_space_both + [",", ":"]


def pretty_untokenize(typed_tokens):
    """Join tokens back into a readable, whitespace-normalized string.

    `typed_tokens` is an iterable of (tokenize type, token string) pairs.
    NEWLINE and ENDMARKER tokens are ignored; INDENT, DEDENT and NL tokens
    must not appear (this is asserted).

    Returns the normalized source text as a single string.
    """
    text = []
    # True when the previous token was a NAME/NUMBER/STRING, so that two such
    # tokens in a row ("a and b") stay separated by a space.
    prev_was_space_delim = False
    # True when the previous token asked for a space after itself.
    prev_wants_space = False
    prev_was_open_paren_or_comma = False
    # True when the previous token ended something that looks like a value.
    prev_was_object_like = False
    # Stack of currently open brackets, innermost last.
    brackets = []
    for token_type, token in typed_tokens:
        assert token_type not in (tokenize.INDENT, tokenize.DEDENT,
                                  tokenize.NL)
        if token_type in (tokenize.NEWLINE, tokenize.ENDMARKER):
            continue
        is_word = token_type in (tokenize.NAME, tokenize.NUMBER,
                                 tokenize.STRING)
        if is_word:
            if prev_wants_space or prev_was_space_delim:
                text.append(" ")
            text.append(token)
            prev_wants_space = False
            prev_was_space_delim = True
        else:
            if token in ("(", "[", "{"):
                brackets.append(token)
            elif brackets and token in (")", "]", "}"):
                brackets.pop()
            this_wants_space_before = (token in _python_space_before)
            this_wants_space_after = (token in _python_space_after)
            # Special case for slice syntax: foo[:10]
            # Otherwise ":" is spaced after, like: "{1: ...}", "if a: ..."
            if token == ":" and brackets and brackets[-1] == "[":
                this_wants_space_after = False
            # Special case for foo(*args), foo(a, *args):
            if token in ("*", "**") and prev_was_open_paren_or_comma:
                this_wants_space_before = False
                this_wants_space_after = False
            # Special case for "a = foo(b=1)": "=" is only spaced when it is
            # not inside any bracket.
            if token == "=" and not brackets:
                this_wants_space_before = True
                this_wants_space_after = True
            # Special case for unary -, +. Our heuristic is that if we see
            # the + or - after something that looks like an object (a NAME,
            # NUMBER, STRING, or closing bracket) then it is probably
            # binary, otherwise it is probably unary.
            if token in ("+", "-") and not prev_was_object_like:
                this_wants_space_before = False
                this_wants_space_after = False
            if prev_wants_space or this_wants_space_before:
                text.append(" ")
            text.append(token)
            prev_wants_space = this_wants_space_after
            prev_was_space_delim = False
        # A closing "]" or "}" ends a value just as ")" does; without them
        # "a[0] - 1" would be misread as a unary minus and come out as
        # "a[0]-1".
        prev_was_object_like = is_word or token in (")", "]", "}")
        prev_was_open_paren_or_comma = token in ("(", ",")
    return "".join(text)
|
|
|
|
def normalize_token_spacing(code):
    """Return `code` re-rendered by pretty_untokenize with normalized spacing.

    The string is tokenized with tokenize.generate_tokens and only the
    (type, string) part of each token is passed on.
    """
    readline = StringIO(code).readline
    typed_tokens = []
    # Tokenize completely before formatting, so any tokenizer error is
    # raised before pretty_untokenize starts.
    for tok in tokenize.generate_tokens(readline):
        typed_tokens.append((tok[0], tok[1]))
    return pretty_untokenize(typed_tokens)
|
|
|
|
def test_pretty_untokenize_and_normalize_token_spacing():
    """Check normalize_token_spacing against (input, expected) pairs."""
    cases = [
        # Plain binary operators and keywords.
        ("1 + 1", "1 + 1"),
        ("1+1", "1 + 1"),
        ("1*(2+3**2)", "1 * (2 + 3 ** 2)"),
        ("a and b", "a and b"),
        # Keyword arguments, slices, dict literals and adjacent strings.
        ("foo(a=bar.baz[1:])", "foo(a=bar.baz[1:])"),
        ("""{"hi":foo[:]}""", """{"hi": foo[:]}"""),
        ("""'a' "b" 'c'""", """'a' "b" 'c'"""),
        ('"""a""" is 1 or 2==3', '"""a""" is 1 or 2 == 3'),
        # "*" and "**" as argument unpacking versus binary operators.
        ("foo ( * args )", "foo(*args)"),
        ("foo ( a * args )", "foo(a * args)"),
        ("foo ( ** args )", "foo(**args)"),
        ("foo ( a ** args )", "foo(a ** args)"),
        ("foo (1, * args )", "foo(1, *args)"),
        ("foo (1, a * args )", "foo(1, a * args)"),
        ("foo (1, ** args )", "foo(1, **args)"),
        ("foo (1, a ** args )", "foo(1, a ** args)"),
        # "=" is spaced at top level but not inside a call.
        ("a=foo(b = 1)", "a = foo(b=1)"),
        # Unary versus binary "+" and "-".
        ("foo(+ 10, bar = - 1)", "foo(+10, bar=-1)"),
        ("1 + +10 + -1 - 5", "1 + +10 + -1 - 5"),
    ]
    for raw, normalized in cases:
        assert normalize_token_spacing(raw) == normalized
|