about summary refs log tree commit diff
diff options
context:
space:
mode:
authorYury Selivanov <yselivanov@sprymix.com>2015-07-23 15:01:58 +0300
committerYury Selivanov <yselivanov@sprymix.com>2015-07-23 15:01:58 +0300
commit96ec934e755355cfc5af036db8641646b7ddb45e (patch)
treea6fd6a4cbef1b75ab0cc10db01fd91ecf2e99976
parentIssue #24687: Plug refleak on SyntaxError in function parameters annotations. (diff)
downloadcpython-96ec934e755355cfc5af036db8641646b7ddb45e.tar.gz
cpython-96ec934e755355cfc5af036db8641646b7ddb45e.tar.bz2
cpython-96ec934e755355cfc5af036db8641646b7ddb45e.zip
Issue #24619: Simplify async/await tokenization.
This commit simplifies async/await tokenization in tokenizer.c, tokenize.py & lib2to3/tokenize.py. Previous solution was to keep a stack of async-def & def blocks, whereas the new approach is just to remember position of the outermost async-def block. This change won't bring any parsing performance improvements, but it makes the code much easier to read and validate.
-rw-r--r--Lib/lib2to3/pgen2/tokenize.py33
-rw-r--r--Lib/lib2to3/tests/test_parser.py22
-rw-r--r--Lib/test/test_coroutines.py1
-rw-r--r--Lib/test/test_tokenize.py73
-rw-r--r--Lib/tokenize.py39
-rw-r--r--Parser/tokenizer.c126
-rw-r--r--Parser/tokenizer.h21
7 files changed, 183 insertions, 132 deletions
diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py
index 896b0fa0ad4..1ff1c61ee22 100644
--- a/Lib/lib2to3/pgen2/tokenize.py
+++ b/Lib/lib2to3/pgen2/tokenize.py
@@ -366,10 +366,11 @@ def generate_tokens(readline):
contline = None
indents = [0]
- # 'stashed' and 'ctx' are used for async/await parsing
+ # 'stashed' and 'async_*' are used for async/await parsing
stashed = None
- ctx = [('sync', 0)]
- in_async = 0
+ async_def = False
+ async_def_indent = 0
+ async_def_nl = False
while 1: # loop over lines in stream
try:
@@ -438,15 +439,18 @@ def generate_tokens(readline):
("<tokenize>", lnum, pos, line))
indents = indents[:-1]
- cur_indent = indents[-1]
- while len(ctx) > 1 and ctx[-1][1] >= cur_indent:
- if ctx[-1][0] == 'async':
- in_async -= 1
- assert in_async >= 0
- ctx.pop()
+ if async_def and async_def_indent >= indents[-1]:
+ async_def = False
+ async_def_nl = False
+ async_def_indent = 0
yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
+ if async_def and async_def_nl and async_def_indent >= indents[-1]:
+ async_def = False
+ async_def_nl = False
+ async_def_indent = 0
+
else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
@@ -466,10 +470,13 @@ def generate_tokens(readline):
newline = NEWLINE
if parenlev > 0:
newline = NL
+ elif async_def:
+ async_def_nl = True
if stashed:
yield stashed
stashed = None
yield (newline, token, spos, epos, line)
+
elif initial == '#':
assert not token.endswith("\n")
if stashed:
@@ -508,7 +515,7 @@ def generate_tokens(readline):
yield (STRING, token, spos, epos, line)
elif initial in namechars: # ordinary name
if token in ('async', 'await'):
- if in_async:
+ if async_def:
yield (ASYNC if token == 'async' else AWAIT,
token, spos, epos, line)
continue
@@ -523,15 +530,13 @@ def generate_tokens(readline):
and stashed[0] == NAME
and stashed[1] == 'async'):
- ctx.append(('async', indents[-1]))
- in_async += 1
+ async_def = True
+ async_def_indent = indents[-1]
yield (ASYNC, stashed[1],
stashed[2], stashed[3],
stashed[4])
stashed = None
- else:
- ctx.append(('sync', indents[-1]))
if stashed:
yield stashed
diff --git a/Lib/lib2to3/tests/test_parser.py b/Lib/lib2to3/tests/test_parser.py
index 107b5ab68b8..b533c01e28f 100644
--- a/Lib/lib2to3/tests/test_parser.py
+++ b/Lib/lib2to3/tests/test_parser.py
@@ -67,10 +67,32 @@ class TestAsyncAwait(GrammarTest):
await x
""")
+ self.validate("""async def foo():
+
+ def foo(): pass
+
+ def foo(): pass
+
+ await x
+ """)
+
+ self.validate("""async def foo(): return await a""")
+
+ self.validate("""def foo():
+ def foo(): pass
+ async def foo(): await x
+ """)
+
self.invalid_syntax("await x")
self.invalid_syntax("""def foo():
await x""")
+ self.invalid_syntax("""def foo():
+ def foo(): pass
+ async def foo(): pass
+ await x
+ """)
+
def test_async_var(self):
self.validate("""async = 1""")
self.validate("""await = 1""")
diff --git a/Lib/test/test_coroutines.py b/Lib/test/test_coroutines.py
index 14682ca6047..10de85644ee 100644
--- a/Lib/test/test_coroutines.py
+++ b/Lib/test/test_coroutines.py
@@ -330,6 +330,7 @@ class AsyncBadSyntaxTest(unittest.TestCase):
async def f():
async def g(): pass
await z
+ await = 1
self.assertTrue(inspect.iscoroutinefunction(f))
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index e3205628818..b7ca08949a3 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -840,6 +840,79 @@ Async/await extension:
OP ')' (1, 19) (1, 20)
OP ':' (1, 20) (1, 21)
AWAIT 'await' (1, 22) (1, 27)
+
+ >>> dump_tokens('''def f():
+ ...
+ ... def baz(): pass
+ ... async def bar(): pass
+ ...
+ ... await = 2''')
+ ENCODING 'utf-8' (0, 0) (0, 0)
+ NAME 'def' (1, 0) (1, 3)
+ NAME 'f' (1, 4) (1, 5)
+ OP '(' (1, 5) (1, 6)
+ OP ')' (1, 6) (1, 7)
+ OP ':' (1, 7) (1, 8)
+ NEWLINE '\\n' (1, 8) (1, 9)
+ NL '\\n' (2, 0) (2, 1)
+ INDENT ' ' (3, 0) (3, 2)
+ NAME 'def' (3, 2) (3, 5)
+ NAME 'baz' (3, 6) (3, 9)
+ OP '(' (3, 9) (3, 10)
+ OP ')' (3, 10) (3, 11)
+ OP ':' (3, 11) (3, 12)
+ NAME 'pass' (3, 13) (3, 17)
+ NEWLINE '\\n' (3, 17) (3, 18)
+ ASYNC 'async' (4, 2) (4, 7)
+ NAME 'def' (4, 8) (4, 11)
+ NAME 'bar' (4, 12) (4, 15)
+ OP '(' (4, 15) (4, 16)
+ OP ')' (4, 16) (4, 17)
+ OP ':' (4, 17) (4, 18)
+ NAME 'pass' (4, 19) (4, 23)
+ NEWLINE '\\n' (4, 23) (4, 24)
+ NL '\\n' (5, 0) (5, 1)
+ NAME 'await' (6, 2) (6, 7)
+ OP '=' (6, 8) (6, 9)
+ NUMBER '2' (6, 10) (6, 11)
+ DEDENT '' (7, 0) (7, 0)
+
+ >>> dump_tokens('''async def f():
+ ...
+ ... def baz(): pass
+ ... async def bar(): pass
+ ...
+ ... await = 2''')
+ ENCODING 'utf-8' (0, 0) (0, 0)
+ ASYNC 'async' (1, 0) (1, 5)
+ NAME 'def' (1, 6) (1, 9)
+ NAME 'f' (1, 10) (1, 11)
+ OP '(' (1, 11) (1, 12)
+ OP ')' (1, 12) (1, 13)
+ OP ':' (1, 13) (1, 14)
+ NEWLINE '\\n' (1, 14) (1, 15)
+ NL '\\n' (2, 0) (2, 1)
+ INDENT ' ' (3, 0) (3, 2)
+ NAME 'def' (3, 2) (3, 5)
+ NAME 'baz' (3, 6) (3, 9)
+ OP '(' (3, 9) (3, 10)
+ OP ')' (3, 10) (3, 11)
+ OP ':' (3, 11) (3, 12)
+ NAME 'pass' (3, 13) (3, 17)
+ NEWLINE '\\n' (3, 17) (3, 18)
+ ASYNC 'async' (4, 2) (4, 7)
+ NAME 'def' (4, 8) (4, 11)
+ NAME 'bar' (4, 12) (4, 15)
+ OP '(' (4, 15) (4, 16)
+ OP ')' (4, 16) (4, 17)
+ OP ':' (4, 17) (4, 18)
+ NAME 'pass' (4, 19) (4, 23)
+ NEWLINE '\\n' (4, 23) (4, 24)
+ NL '\\n' (5, 0) (5, 1)
+ AWAIT 'await' (6, 2) (6, 7)
+ OP '=' (6, 8) (6, 9)
+ NUMBER '2' (6, 10) (6, 11)
+ DEDENT '' (7, 0) (7, 0)
"""
from test import support
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index c3efdda528d..65d06e53f3b 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -498,10 +498,11 @@ def _tokenize(readline, encoding):
contline = None
indents = [0]
- # 'stashed' and 'ctx' are used for async/await parsing
+ # 'stashed' and 'async_*' are used for async/await parsing
stashed = None
- ctx = [('sync', 0)]
- in_async = 0
+ async_def = False
+ async_def_indent = 0
+ async_def_nl = False
if encoding is not None:
if encoding == "utf-8-sig":
@@ -579,15 +580,18 @@ def _tokenize(readline, encoding):
("<tokenize>", lnum, pos, line))
indents = indents[:-1]
- cur_indent = indents[-1]
- while len(ctx) > 1 and ctx[-1][1] >= cur_indent:
- if ctx[-1][0] == 'async':
- in_async -= 1
- assert in_async >= 0
- ctx.pop()
+ if async_def and async_def_indent >= indents[-1]:
+ async_def = False
+ async_def_nl = False
+ async_def_indent = 0
yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
+ if async_def and async_def_nl and async_def_indent >= indents[-1]:
+ async_def = False
+ async_def_nl = False
+ async_def_indent = 0
+
else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
@@ -609,8 +613,13 @@ def _tokenize(readline, encoding):
if stashed:
yield stashed
stashed = None
- yield TokenInfo(NL if parenlev > 0 else NEWLINE,
- token, spos, epos, line)
+ if parenlev > 0:
+ yield TokenInfo(NL, token, spos, epos, line)
+ else:
+ yield TokenInfo(NEWLINE, token, spos, epos, line)
+ if async_def:
+ async_def_nl = True
+
elif initial == '#':
assert not token.endswith("\n")
if stashed:
@@ -644,7 +653,7 @@ def _tokenize(readline, encoding):
yield TokenInfo(STRING, token, spos, epos, line)
elif initial.isidentifier(): # ordinary name
if token in ('async', 'await'):
- if in_async:
+ if async_def:
yield TokenInfo(
ASYNC if token == 'async' else AWAIT,
token, spos, epos, line)
@@ -660,15 +669,13 @@ def _tokenize(readline, encoding):
and stashed.type == NAME
and stashed.string == 'async'):
- ctx.append(('async', indents[-1]))
- in_async += 1
+ async_def = True
+ async_def_indent = indents[-1]
yield TokenInfo(ASYNC, stashed.string,
stashed.start, stashed.end,
stashed.line)
stashed = None
- else:
- ctx.append(('sync', indents[-1]))
if stashed:
yield stashed
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 46c058083f8..04baeaf38ad 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -31,12 +31,6 @@
|| c == '_'\
|| (c >= 128))
-/* The following DEFTYPE* flags are used in 'tok_state->deftypestack',
- and should be removed in 3.7, when async/await are regular
- keywords. */
-#define DEFTYPE_ASYNC 1
-#define DEFTYPE_HAS_NL 2
-
extern char *PyOS_Readline(FILE *, FILE *, const char *);
/* Return malloc'ed string including trailing \n;
empty malloc'ed string for EOF;
@@ -133,12 +127,6 @@ tok_new(void)
tok->indent = 0;
tok->indstack[0] = 0;
- tok->def = 0;
- tok->defstack[0] = 0;
- tok->deftypestack[0] = 0;
- tok->def_async_behind = 0;
- tok->def_in_async = 0;
-
tok->atbol = 1;
tok->pendin = 0;
tok->prompt = tok->nextprompt = NULL;
@@ -159,6 +147,11 @@ tok_new(void)
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
#endif
+
+ tok->async_def = 0;
+ tok->async_def_indent = 0;
+ tok->async_def_nl = 0;
+
return tok;
}
@@ -1350,11 +1343,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
int c;
int blankline, nonascii;
- int tok_len;
- struct tok_state ahead_tok;
- char *ahead_tok_start = NULL, *ahead_top_end = NULL;
- int ahead_tok_kind;
-
*p_start = *p_end = NULL;
nextline:
tok->start = NULL;
@@ -1442,16 +1430,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
if (tok->pendin != 0) {
if (tok->pendin < 0) {
tok->pendin++;
-
- while (tok->def && tok->defstack[tok->def] >= tok->indent) {
- if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) {
- tok->def_in_async--;
- assert(tok->def_in_async >= 0);
- }
- tok->def--;
- assert(tok->def >= 0);
- }
-
return DEDENT;
}
else {
@@ -1460,20 +1438,19 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
}
}
- if (!blankline && tok->level == 0
- && tok->def && tok->deftypestack[tok->def] & DEFTYPE_HAS_NL
- && tok->defstack[tok->def] >= tok->indent)
+ if (tok->async_def
+ && !blankline
+ && tok->level == 0
+ /* There was a NEWLINE after ASYNC DEF,
+ so we're past the signature. */
+ && tok->async_def_nl
+ /* Current indentation level is less than where
+ the async function was defined */
+ && tok->async_def_indent >= tok->indent)
{
- /* The top function on the stack did have a NEWLINE
- token, but didn't have an INDENT. That means that
- it's a one-line function and it should now be removed
- from the stack. */
- if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) {
- tok->def_in_async--;
- assert(tok->def_in_async >= 0);
- }
- tok->def--;
- assert(tok->def >= 0);
+ tok->async_def = 0;
+ tok->async_def_indent = 0;
+ tok->async_def_nl = 0;
}
again:
@@ -1528,38 +1505,27 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
*p_start = tok->start;
*p_end = tok->cur;
- tok_len = tok->cur - tok->start;
- if (tok_len == 3 && memcmp(tok->start, "def", 3) == 0) {
- /* The current token is 'def'. */
- if (tok->def + 1 >= MAXINDENT) {
- tok->done = E_TOODEEP;
- tok->cur = tok->inp;
- return ERRORTOKEN;
+ /* async/await parsing block. */
+ if (tok->cur - tok->start == 5) {
+ /* Current token length is 5. */
+ if (tok->async_def) {
+ /* We're inside an 'async def' function. */
+ if (memcmp(tok->start, "async", 5) == 0)
+ return ASYNC;
+ if (memcmp(tok->start, "await", 5) == 0)
+ return AWAIT;
}
+ else if (memcmp(tok->start, "async", 5) == 0) {
+ /* The current token is 'async'.
+ Look ahead one token.*/
- /* Advance defs stack. */
- tok->def++;
- tok->defstack[tok->def] = tok->indent;
+ struct tok_state ahead_tok;
+ char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
+ int ahead_tok_kind;
- if (tok->def_async_behind) {
- /* The previous token was 'async'. */
- tok->def_async_behind = 0;
- tok->deftypestack[tok->def] = DEFTYPE_ASYNC;
- tok->def_in_async++;
- }
- else {
- /* This is a regular function (not async def). */
- tok->deftypestack[tok->def] = 0;
- }
- }
- else if (tok_len == 5) {
- if (memcmp(tok->start, "async", 5) == 0) {
- /* The current token is 'async'. */
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
-
- /* Try to look ahead one token. */
ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
- &ahead_top_end);
+ &ahead_tok_end);
if (ahead_tok_kind == NAME
&& ahead_tok.cur - ahead_tok.start == 3
@@ -1567,22 +1533,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
{
/* The next token is going to be 'def', so instead of
returning 'async' NAME token, we return ASYNC. */
- tok->def_async_behind = 1;
+ tok->async_def_indent = tok->indent;
+ tok->async_def = 1;
return ASYNC;
}
- else if (tok->def_in_async)
- {
- /* We're inside an 'async def' function, so we treat
- 'async' token as ASYNC, instead of NAME. */
- return ASYNC;
- }
-
- }
- else if (memcmp(tok->start, "await", 5) == 0 && tok->def_in_async)
- {
- /* We're inside an 'async def' function, so we treat
- 'await' token as AWAIT, instead of NAME. */
- return AWAIT;
}
}
@@ -1597,12 +1551,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
*p_start = tok->start;
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
- if (tok->def) {
- /* Mark the top function on the stack that it had
- at least one NEWLINE. That will help us to
- distinguish one-line functions from functions
- with multiple statements. */
- tok->deftypestack[tok->def] |= DEFTYPE_HAS_NL;
+ if (tok->async_def) {
+ /* We're somewhere inside an 'async def' function, and
+ we've encountered a NEWLINE after its signature. */
+ tok->async_def_nl = 1;
}
return NEWLINE;
}
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index e198a0b6f55..af053e250a3 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -66,21 +66,12 @@ struct tok_state {
const char* str;
const char* input; /* Tokenizer's newline translated copy of the string. */
- /* `def*` fields are for parsing async/await in a backwards compatible
- way. They should be removed in 3.7, when they will become
- regular constants. See PEP 492 for more details. */
- int defstack[MAXINDENT]; /* Stack of funcs & indents where they
- were defined. */
- int deftypestack[MAXINDENT]; /* Stack of func flags, see DEFTYPE_*
- constants. */
- int def; /* Length of stack of func types/flags. */
- int def_async_behind; /* 1 if there was an 'async' token before
- a 'def' token. */
- int def_in_async; /* Counter of how deep 'async def's
- are nested. If greater than 0,
- we are somewhere in an 'async def'
- body, so 'async' and 'await' should
- be parsed as keywords.*/
+ /* async/await related fields; can be removed in 3.7 when async and await
+ become normal keywords. */
+ int async_def; /* =1 if tokens are inside an 'async def' body. */
+ int async_def_indent; /* Indentation level of the outermost 'async def'. */
+ int async_def_nl; /* =1 if the outermost 'async def' had at least one
+ NEWLINE token after it. */
};
extern struct tok_state *PyTokenizer_FromString(const char *, int);