From 4fe72090deb7fb7bc09bfa56c92f6b3b0967d395 Mon Sep 17 00:00:00 2001
From: Eric Snow <ericsnowcurrently@gmail.com>
Date: Fri, 30 Oct 2020 15:46:52 -0600
Subject: [PATCH] bpo-36876: Small adjustments to the C-analyzer tool.
 (GH-23045)

This is a little bit of clean-up, small fixes, and additional helpers prior to building an updated & accurate list of globals to eliminate.
---
 Tools/c-analyzer/c_analyzer/__init__.py      |   8 +-
 Tools/c-analyzer/c_analyzer/__main__.py      | 104 +++++----
 Tools/c-analyzer/c_analyzer/analyze.py       |   6 +-
 Tools/c-analyzer/c_analyzer/datafiles.py     |   3 +-
 Tools/c-analyzer/c_analyzer/info.py          |  42 +---
 Tools/c-analyzer/c_analyzer/match.py         | 212 +++++++++++++++++++
 Tools/c-analyzer/c_common/scriptutil.py      |  51 ++++-
 Tools/c-analyzer/c_parser/datafiles.py       |   2 +-
 Tools/c-analyzer/c_parser/info.py            | 166 ++++-----------
 Tools/c-analyzer/c_parser/match.py           | 177 ++++++++++++++++
 Tools/c-analyzer/c_parser/parser/__init__.py |   6 +-
 Tools/c-analyzer/c_parser/parser/_info.py    |  15 ++
 Tools/c-analyzer/c_parser/parser/_regexes.py |   3 +-
 Tools/c-analyzer/cpython/__main__.py         |   5 +-
 Tools/c-analyzer/cpython/_analyzer.py        |   7 +-
 Tools/c-analyzer/cpython/_parser.py          |  44 +++-
 16 files changed, 633 insertions(+), 218 deletions(-)
 create mode 100644 Tools/c-analyzer/c_analyzer/match.py
 create mode 100644 Tools/c-analyzer/c_parser/match.py

diff --git a/Tools/c-analyzer/c_analyzer/__init__.py b/Tools/c-analyzer/c_analyzer/__init__.py
index 4a01cd396f5f5f..171fa25102bffc 100644
--- a/Tools/c-analyzer/c_analyzer/__init__.py
+++ b/Tools/c-analyzer/c_analyzer/__init__.py
@@ -4,10 +4,12 @@
 from c_parser.info import (
     KIND,
     TypeDeclaration,
-    filter_by_kind,
-    collate_by_kind_group,
     resolve_parsed,
 )
+from c_parser.match import (
+    filter_by_kind,
+    group_by_kinds,
+)
 from . import (
     analyze as _analyze,
     datafiles as _datafiles,
@@ -55,7 +57,7 @@ def analyze_decls(decls, known, *,
     )
 
     decls = list(decls)
-    collated = collate_by_kind_group(decls)
+    collated = group_by_kinds(decls)
 
     types = {decl: None for decl in collated['type']}
     typespecs = _analyze.get_typespecs(types)
diff --git a/Tools/c-analyzer/c_analyzer/__main__.py b/Tools/c-analyzer/c_analyzer/__main__.py
index 1fd45b985d9bcf..4cff1d4efb5fe9 100644
--- a/Tools/c-analyzer/c_analyzer/__main__.py
+++ b/Tools/c-analyzer/c_analyzer/__main__.py
@@ -1,5 +1,6 @@
 import io
 import logging
+import os
 import os.path
 import re
 import sys
@@ -9,6 +10,7 @@
     add_verbosity_cli,
     add_traceback_cli,
     add_sepval_cli,
+    add_progress_cli,
     add_files_cli,
     add_commands_cli,
     process_args_by_key,
@@ -17,11 +19,13 @@
     filter_filenames,
     iter_marks,
 )
-from c_parser.info import KIND, is_type_decl
+from c_parser.info import KIND
+from c_parser.match import is_type_decl
+from .match import filter_forward
 from . import (
     analyze as _analyze,
-    check_all as _check_all,
     datafiles as _datafiles,
+    check_all as _check_all,
 )
 
 
@@ -44,7 +48,7 @@
 TABLE_SECTIONS = {
     'types': (
         ['kind', 'name', 'data', 'file'],
-        is_type_decl,
+        KIND.is_type_decl,
         (lambda v: (v.kind.value, v.filename or '', v.name)),
     ),
     'typedefs': 'types',
@@ -167,9 +171,7 @@ def handle_failure(failure, data):
             print(f'{data.filename}:{name} - {failure}')
     elif fmt == 'summary':
         def handle_failure(failure, data):
-            parent = data.parent or ''
-            funcname = parent if isinstance(parent, str) else parent.name
-            print(f'{data.filename:35}\t{funcname or "-":35}\t{data.name:40}\t{failure}')
+            print(_fmt_one_summary(data, failure))
     elif fmt == 'full':
         div = ''
         def handle_failure(failure, data):
@@ -230,6 +232,15 @@ def section(name):
     yield f'grand total: {total}'
 
 
+def _fmt_one_summary(item, extra=None):
+    parent = item.parent or ''
+    funcname = parent if isinstance(parent, str) else parent.name
+    if extra:
+        return f'{item.filename:35}\t{funcname or "-":35}\t{item.name:40}\t{extra}'
+    else:
+        return f'{item.filename:35}\t{funcname or "-":35}\t{item.name}'
+
+
 def fmt_full(analysis):
     # XXX Support sorting.
     items = sorted(analysis, key=lambda v: v.key)
@@ -272,10 +283,12 @@ def process_checks(args):
             args.checks = [check]
     else:
         process_checks = add_checks_cli(parser, checks=checks)
+    process_progress = add_progress_cli(parser)
     process_output = add_output_cli(parser, default=None)
     process_files = add_files_cli(parser, **kwargs)
     return [
         process_checks,
+        process_progress,
         process_output,
         process_files,
     ]
@@ -288,6 +301,7 @@ def cmd_check(filenames, *,
               relroot=None,
               failfast=False,
               iter_filenames=None,
+              track_progress=None,
               verbosity=VERBOSITY,
               _analyze=_analyze,
               _CHECKS=CHECKS,
@@ -304,36 +318,53 @@ def cmd_check(filenames, *,
      ) = _get_check_handlers(fmt, printer, verbosity)
 
     filenames = filter_filenames(filenames, iter_filenames)
+    if track_progress:
+        filenames = track_progress(filenames)
 
-    logger.info('analyzing...')
+    logger.info('analyzing files...')
     analyzed = _analyze(filenames, **kwargs)
     if relroot:
         analyzed.fix_filenames(relroot)
+    decls = filter_forward(analyzed, markpublic=True)
 
-    logger.info('checking...')
-    numfailed = 0
-    for data, failure in _check_all(analyzed, checks, failfast=failfast):
+    logger.info('checking analysis results...')
+    failed = []
+    for data, failure in _check_all(decls, checks, failfast=failfast):
         if data is None:
             printer.info('stopping after one failure')
             break
-        if div is not None and numfailed > 0:
+        if div is not None and len(failed) > 0:
             printer.info(div)
-        numfailed += 1
+        failed.append(data)
         handle_failure(failure, data)
     handle_after()
 
     printer.info('-------------------------')
-    logger.info(f'total failures: {numfailed}')
+    logger.info(f'total failures: {len(failed)}')
     logger.info('done checking')
 
-    if numfailed > 0:
-        sys.exit(numfailed)
+    if fmt == 'summary':
+        print('Categorized by storage:')
+        print()
+        from .match import group_by_storage
+        grouped = group_by_storage(failed, ignore_non_match=False)
+        for group, decls in grouped.items():
+            print()
+            print(group)
+            for decl in decls:
+                print(' ', _fmt_one_summary(decl))
+            print(f'subtotal: {len(decls)}')
+
+    if len(failed) > 0:
+        sys.exit(len(failed))
 
 
 def _cli_analyze(parser, **kwargs):
+    process_progress = add_progress_cli(parser)
     process_output = add_output_cli(parser)
     process_files = add_files_cli(parser, **kwargs)
     return [
+        process_progress,
         process_output,
         process_files,
     ]
@@ -343,6 +374,7 @@ def _cli_analyze(parser, **kwargs):
 def cmd_analyze(filenames, *,
                 fmt=None,
                 iter_filenames=None,
+                track_progress=None,
                 verbosity=None,
                 _analyze=_analyze,
                 formats=FORMATS,
@@ -356,49 +388,46 @@ def cmd_analyze(filenames, *,
         raise ValueError(f'unsupported fmt {fmt!r}')
 
     filenames = filter_filenames(filenames, iter_filenames)
-    if verbosity == 2:
-        def iter_filenames(filenames=filenames):
-            marks = iter_marks()
-            for filename in filenames:
-                print(next(marks), end='')
-                yield filename
-        filenames = iter_filenames()
-    elif verbosity > 2:
-        def iter_filenames(filenames=filenames):
-            for filename in filenames:
-                print(f'<{filename}>')
-                yield filename
-        filenames = iter_filenames()
-
-    logger.info('analyzing...')
+    if track_progress:
+        filenames = track_progress(filenames)
+
+    logger.info('analyzing files...')
     analyzed = _analyze(filenames, **kwargs)
+    decls = filter_forward(analyzed, markpublic=True)
 
-    for line in do_fmt(analyzed):
+    for line in do_fmt(decls):
         print(line)
 
 
 def _cli_data(parser, filenames=None, known=None):
     ArgumentParser = type(parser)
     common = ArgumentParser(add_help=False)
-    if filenames is None:
-        common.add_argument('filenames', metavar='FILE', nargs='+')
+    # These flags will get processed by the top-level parse_args().
+    add_verbosity_cli(common)
+    add_traceback_cli(common)
 
     subs = parser.add_subparsers(dest='datacmd')
 
     sub = subs.add_parser('show', parents=[common])
     if known is None:
         sub.add_argument('--known', required=True)
+    if filenames is None:
+        sub.add_argument('filenames', metavar='FILE', nargs='+')
 
-    sub = subs.add_parser('dump')
+    sub = subs.add_parser('dump', parents=[common])
     if known is None:
         sub.add_argument('--known')
     sub.add_argument('--show', action='store_true')
+    process_progress = add_progress_cli(sub)
 
-    sub = subs.add_parser('check')
+    sub = subs.add_parser('check', parents=[common])
     if known is None:
         sub.add_argument('--known', required=True)
 
-    return None
+    def process_args(args):
+        if args.datacmd == 'dump':
+            process_progress(args)
+    return process_args
 
 
 def cmd_data(datacmd, filenames, known=None, *,
@@ -406,6 +435,7 @@ def cmd_data(datacmd, filenames, known=None, *,
              formats=FORMATS,
              extracolumns=None,
              relroot=None,
+             track_progress=None,
              **kwargs
              ):
     kwargs.pop('verbosity', None)
@@ -417,6 +447,8 @@ def cmd_data(datacmd, filenames, known=None, *,
         for line in do_fmt(known):
             print(line)
     elif datacmd == 'dump':
+        if track_progress:
+            filenames = track_progress(filenames)
         analyzed = _analyze(filenames, **kwargs)
         if known is None or usestdout:
             outfile = io.StringIO()
diff --git a/Tools/c-analyzer/c_analyzer/analyze.py b/Tools/c-analyzer/c_analyzer/analyze.py
index d8ae915e420029..267d058e07abdb 100644
--- a/Tools/c-analyzer/c_analyzer/analyze.py
+++ b/Tools/c-analyzer/c_analyzer/analyze.py
@@ -3,15 +3,19 @@
     TypeDeclaration,
     POTSType,
     FuncPtr,
+)
+from c_parser.match import (
     is_pots,
     is_funcptr,
 )
 from .info import (
     IGNORED,
     UNKNOWN,
-    is_system_type,
     SystemType,
 )
+from .match import (
+    is_system_type,
+)
 
 
 def get_typespecs(typedecls):
diff --git a/Tools/c-analyzer/c_analyzer/datafiles.py b/Tools/c-analyzer/c_analyzer/datafiles.py
index 0de438cce470fd..d37a4eefe351ad 100644
--- a/Tools/c-analyzer/c_analyzer/datafiles.py
+++ b/Tools/c-analyzer/c_analyzer/datafiles.py
@@ -1,5 +1,6 @@
 import c_common.tables as _tables
 import c_parser.info as _info
+import c_parser.match as _match
 import c_parser.datafiles as _parser
 from . import analyze as _analyze
 
@@ -17,7 +18,7 @@ def analyze_known(known, *,
                   handle_unresolved=True,
                   ):
     knowntypes = knowntypespecs = {}
-    collated = _info.collate_by_kind_group(known)
+    collated = _match.group_by_kinds(known)
     types = {decl: None for decl in collated['type']}
     typespecs = _analyze.get_typespecs(types)
     def analyze_decl(decl):
diff --git a/Tools/c-analyzer/c_analyzer/info.py b/Tools/c-analyzer/c_analyzer/info.py
index 23d77611a4c3ca..be9281502d250d 100644
--- a/Tools/c-analyzer/c_analyzer/info.py
+++ b/Tools/c-analyzer/c_analyzer/info.py
@@ -7,7 +7,11 @@
     HighlevelParsedItem,
     Declaration,
     TypeDeclaration,
+)
+from c_parser.match import (
     is_type_decl,
+)
+from .match import (
     is_process_global,
 )
 
@@ -16,44 +20,6 @@
 UNKNOWN = _misc.Labeled('UNKNOWN')
 
 
-# XXX Use known.tsv for these?
-SYSTEM_TYPES = {
-    'int8_t',
-    'uint8_t',
-    'int16_t',
-    'uint16_t',
-    'int32_t',
-    'uint32_t',
-    'int64_t',
-    'uint64_t',
-    'size_t',
-    'ssize_t',
-    'intptr_t',
-    'uintptr_t',
-    'wchar_t',
-    '',
-    # OS-specific
-    'pthread_cond_t',
-    'pthread_mutex_t',
-    'pthread_key_t',
-    'atomic_int',
-    'atomic_uintptr_t',
-    '',
-    # lib-specific
-    'WINDOW',  # curses
-    'XML_LChar',
-    'XML_Size',
-    'XML_Parser',
-    'enum XML_Error',
-    'enum XML_Status',
-    '',
-}
-
-
-def is_system_type(typespec):
-    return typespec in SYSTEM_TYPES
-
-
 class SystemType(TypeDeclaration):
 
     def __init__(self, name):
diff --git a/Tools/c-analyzer/c_analyzer/match.py b/Tools/c-analyzer/c_analyzer/match.py
new file mode 100644
index 00000000000000..5c27e4a224afc8
--- /dev/null
+++ b/Tools/c-analyzer/c_analyzer/match.py
@@ -0,0 +1,212 @@
+import os.path
+
+from c_parser import (
+    info as _info,
+    match as _match,
+)
+
+
+_KIND = _info.KIND
+
+
+# XXX Use known.tsv for these?
+SYSTEM_TYPES = {
+    'int8_t',
+    'uint8_t',
+    'int16_t',
+    'uint16_t',
+    'int32_t',
+    'uint32_t',
+    'int64_t',
+    'uint64_t',
+    'size_t',
+    'ssize_t',
+    'intptr_t',
+    'uintptr_t',
+    'wchar_t',
+    '',
+    # OS-specific
+    'pthread_cond_t',
+    'pthread_mutex_t',
+    'pthread_key_t',
+    'atomic_int',
+    'atomic_uintptr_t',
+    '',
+    # lib-specific
+    'WINDOW',  # curses
+    'XML_LChar',
+    'XML_Size',
+    'XML_Parser',
+    'enum XML_Error',
+    'enum XML_Status',
+    '',
+}
+
+
+def is_system_type(typespec):
+    return typespec in SYSTEM_TYPES
+
+
+##################################
+# decl matchers
+
+def is_public(decl):
+    if not decl.filename.endswith('.h'):
+        return False
+    if 'Include' not in decl.filename.split(os.path.sep):
+        return False
+    return True
+
+
+def is_process_global(vardecl):
+    kind, storage, _, _, _ = _info.get_parsed_vartype(vardecl)
+    if kind is not _KIND.VARIABLE:
+        raise NotImplementedError(vardecl)
+    if 'static' in (storage or ''):
+        return True
+
+    if hasattr(vardecl, 'parent'):
+        parent = vardecl.parent
+    else:
+        parent = vardecl.get('parent')
+    return not parent
+
+
+def is_fixed_type(vardecl):
+    if not vardecl:
+        return None
+    _, _, _, typespec, abstract = _info.get_parsed_vartype(vardecl)
+    if 'typeof' in typespec:
+        raise NotImplementedError(vardecl)
+    elif not abstract:
+        return True
+
+    if '*' not in abstract:
+        # XXX What about []?
+        return True
+    elif _match._is_funcptr(abstract):
+        return True
+    else:
+        for after in abstract.split('*')[1:]:
+            if not after.lstrip().startswith('const'):
+                return False
+        else:
+            return True
+
+
+def is_immutable(vardecl):
+    if not vardecl:
+        return None
+    if not is_fixed_type(vardecl):
+        return False
+    _, _, typequal, _, _ = _info.get_parsed_vartype(vardecl)
+    # If there, it can only be "const" or "volatile".
+    return typequal == 'const'
+
+
+def is_public_api(decl):
+    if not is_public(decl):
+        return False
+    if decl.kind is _KIND.TYPEDEF:
+        return True
+    elif _match.is_type_decl(decl):
+        return not _match.is_forward_decl(decl)
+    else:
+        return _match.is_external_reference(decl)
+
+
+def is_public_declaration(decl):
+    if not is_public(decl):
+        return False
+    if decl.kind is _KIND.TYPEDEF:
+        return True
+    elif _match.is_type_decl(decl):
+        return _match.is_forward_decl(decl)
+    else:
+        return _match.is_external_reference(decl)
+
+
+def is_public_definition(decl):
+    if not is_public(decl):
+        return False
+    if decl.kind is _KIND.TYPEDEF:
+        return True
+    elif _match.is_type_decl(decl):
+        return not _match.is_forward_decl(decl)
+    else:
+        return not _match.is_external_reference(decl)
+
+
+def is_public_impl(decl):
+    if not _KIND.is_decl(decl.kind):
+        return False
+    # See filter_forward() about "is_public".
+    return getattr(decl, 'is_public', False)
+
+
+def is_module_global_decl(decl):
+    if is_public_impl(decl):
+        return False
+    if _match.is_forward_decl(decl):
+        return False
+    return not _match.is_local_var(decl)
+
+
+##################################
+# filtering with matchers
+
+def filter_forward(items, *, markpublic=False):
+    if markpublic:
+        public = set()
+        actual = []
+        for item in items:
+            if is_public_api(item):
+                public.add(item.id)
+            elif not _match.is_forward_decl(item):
+                actual.append(item)
+            else:
+                # non-public duplicate!
+                # XXX
+                raise Exception(item)
+        for item in actual:
+            _info.set_flag(item, 'is_public', item.id in public)
+            yield item
+    else:
+        for item in items:
+            if _match.is_forward_decl(item):
+                continue
+            yield item
+
+
+##################################
+# grouping with matchers
+
+def group_by_storage(decls, **kwargs):
+    def is_module_global(decl):
+        if not is_module_global_decl(decl):
+            return False
+        if decl.kind == _KIND.VARIABLE:
+            if _info.get_effective_storage(decl) == 'static':
+                # This is covered by is_static_module_global().
+                return False
+        return True
+    def is_static_module_global(decl):
+        if not _match.is_global_var(decl):
+            return False
+        return _info.get_effective_storage(decl) == 'static'
+    def is_static_local(decl):
+        if not _match.is_local_var(decl):
+            return False
+        return _info.get_effective_storage(decl) == 'static'
+    #def is_local(decl):
+    #    if not _match.is_local_var(decl):
+    #        return False
+    #    return _info.get_effective_storage(decl) != 'static'
+    categories = {
+        #'extern': is_extern,
+        'published': is_public_impl,
+        'module-global': is_module_global,
+        'static-module-global': is_static_module_global,
+        'static-local': is_static_local,
+    }
+    return _match.group_by_category(decls, categories, **kwargs)
diff --git a/Tools/c-analyzer/c_common/scriptutil.py b/Tools/c-analyzer/c_common/scriptutil.py
index 939a85003b2964..222059015d76ec 100644
--- a/Tools/c-analyzer/c_common/scriptutil.py
+++ b/Tools/c-analyzer/c_common/scriptutil.py
@@ -10,6 +10,9 @@
 from . import fsutil, strutil, iterutil, logging as loggingutil
 
 
+_NOT_SET = object()
+
+
 def get_prog(spec=None, *, absolute=False, allowsuffix=True):
     if spec is None:
         _, spec = _find_script()
@@ -313,6 +316,22 @@ def _parse_files(filenames):
         yield filename.strip()
 
 
+def add_progress_cli(parser, *, threshold=VERBOSITY, **kwargs):
+    parser.add_argument('--progress', dest='track_progress', action='store_const', const=True)
+    parser.add_argument('--no-progress', dest='track_progress', action='store_false')
+    parser.set_defaults(track_progress=True)
+
+    def process_args(args):
+        if args.track_progress:
+            ns = vars(args)
+            verbosity = ns.get('verbosity', VERBOSITY)
+            if verbosity <= threshold:
+                args.track_progress = track_progress_compact
+            else:
+                args.track_progress = track_progress_flat
+    return process_args
+
+
 def add_failure_filtering_cli(parser, pool, *, default=False):
     parser.add_argument('--fail', action='append',
                         metavar=f'"{{all|{"|".join(sorted(pool))}}},..."')
@@ -551,13 +570,39 @@ def _iter_filenames(filenames, iter_files):
         raise NotImplementedError
 
 
-def iter_marks(mark='.', *, group=5, groups=2, lines=10, sep=' '):
+def track_progress_compact(items, *, groups=5, **mark_kwargs):
+    last = os.linesep
+    marks = iter_marks(groups=groups, **mark_kwargs)
+    for item in items:
+        last = next(marks)
+        print(last, end='', flush=True)
+        yield item
+    if not last.endswith(os.linesep):
+        print()
+
+
+def track_progress_flat(items, fmt='<{}>'):
+    for item in items:
+        print(fmt.format(item), flush=True)
+        yield item
+
+
+def iter_marks(mark='.', *, group=5, groups=2, lines=_NOT_SET, sep=' '):
     mark = mark or ''
+    group = group if group and group > 1 else 1
+    groups = groups if groups and groups > 1 else 1
+
     sep = f'{mark}{sep}' if sep else mark
     end = f'{mark}{os.linesep}'
     div = os.linesep
     perline = group * groups
-    perlines = perline * lines
+    if lines is _NOT_SET:
+        # By default we try to put about 100 in each line group.
+        perlines = 100 // perline * perline
+    elif not lines or lines < 0:
+        perlines = None
+    else:
+        perlines = perline * lines
 
     if perline == 1:
         yield end
@@ -568,7 +613,7 @@ def iter_marks(mark='.', *, group=5, groups=2, lines=10, sep=' '):
     while True:
         if count % perline == 0:
             yield end
-            if count % perlines == 0:
+            if perlines and count % perlines == 0:
                 yield div
         elif count % group == 0:
             yield sep
diff --git a/Tools/c-analyzer/c_parser/datafiles.py b/Tools/c-analyzer/c_parser/datafiles.py
index 5bdb946b1772ab..cdd69b1f9b2d8a 100644
--- a/Tools/c-analyzer/c_parser/datafiles.py
+++ b/Tools/c-analyzer/c_parser/datafiles.py
@@ -92,7 +92,7 @@ def write_decls_tsv(decls, outfile, extracolumns=None, *,
                     **kwargs
                     ):
     # XXX Move the row rendering here.
-    _write_decls_tsv(rows, outfile, extracolumns, relroot, kwargs)
+    _write_decls_tsv(decls, outfile, extracolumns, relroot, kwargs)
 
 
 def _iter_decls_tsv(infile, extracolumns=None, relroot=None):
diff --git a/Tools/c-analyzer/c_parser/info.py b/Tools/c-analyzer/c_parser/info.py
index a07ce2e0ccb8d3..798a45d2e08e71 100644
--- a/Tools/c-analyzer/c_parser/info.py
+++ b/Tools/c-analyzer/c_parser/info.py
@@ -7,85 +7,12 @@
 import c_common.misc as _misc
 import c_common.strutil as _strutil
 import c_common.tables as _tables
-from .parser._regexes import SIMPLE_TYPE
+from .parser._regexes import SIMPLE_TYPE, _STORAGE
 
 
 FIXED_TYPE = _misc.Labeled('FIXED_TYPE')
 
-POTS_REGEX = re.compile(rf'^{SIMPLE_TYPE}$', re.VERBOSE)
-
-
-def is_pots(typespec):
-    if not typespec:
-        return None
-    if type(typespec) is not str:
-        _, _, _, typespec, _ = get_parsed_vartype(typespec)
-    return POTS_REGEX.match(typespec) is not None
-
-
-def is_funcptr(vartype):
-    if not vartype:
-        return None
-    _, _, _, _, abstract = get_parsed_vartype(vartype)
-    return _is_funcptr(abstract)
-
-
-def _is_funcptr(declstr):
-    if not declstr:
-        return None
-    # XXX Support "(<name>*)(".
-    return '(*)(' in declstr.replace(' ', '')
-
-
-def is_exported_symbol(decl):
-    _, storage, _, _, _ = get_parsed_vartype(decl)
-    raise NotImplementedError
-
-
-def is_process_global(vardecl):
-    kind, storage, _, _, _ = get_parsed_vartype(vardecl)
-    if kind is not KIND.VARIABLE:
-        raise NotImplementedError(vardecl)
-    if 'static' in (storage or ''):
-        return True
-
-    if hasattr(vardecl, 'parent'):
-        parent = vardecl.parent
-    else:
-        parent = vardecl.get('parent')
-    return not parent
-
-
-def is_fixed_type(vardecl):
-    if not vardecl:
-        return None
-    _, _, _, typespec, abstract = get_parsed_vartype(vardecl)
-    if 'typeof' in typespec:
-        raise NotImplementedError(vardecl)
-    elif not abstract:
-        return True
-
-    if '*' not in abstract:
-        # XXX What about []?
-        return True
-    elif _is_funcptr(abstract):
-        return True
-    else:
-        for after in abstract.split('*')[1:]:
-            if not after.lstrip().startswith('const'):
-                return False
-        else:
-            return True
-
-
-def is_immutable(vardecl):
-    if not vardecl:
-        return None
-    if not is_fixed_type(vardecl):
-        return False
-    _, _, typequal, _, _ = get_parsed_vartype(vardecl)
-    # If there, it can only be "const" or "volatile".
-    return typequal == 'const'
+STORAGE = frozenset(_STORAGE)
 
 
 #############################
@@ -214,58 +141,8 @@ def resolve_group(cls, group):
 KIND._GROUPS.update((k.value, {k}) for k in KIND)
 
 
-# The module-level kind-related helpers (below) deal with <item>.kind:
-
-def is_type_decl(kind):
-    # Handle ParsedItem, Declaration, etc..
-    kind = getattr(kind, 'kind', kind)
-    return KIND.is_type_decl(kind)
-
-
-def is_decl(kind):
-    # Handle ParsedItem, Declaration, etc..
-    kind = getattr(kind, 'kind', kind)
-    return KIND.is_decl(kind)
-
-
-def filter_by_kind(items, kind):
-    if kind == 'type':
-        kinds = KIND._TYPE_DECLS
-    elif kind == 'decl':
-        kinds = KIND._TYPE_DECLS
-    try:
-        okay = kind in KIND
-    except TypeError:
-        kinds = set(kind)
-    else:
-        kinds = {kind} if okay else set(kind)
-    for item in items:
-        if item.kind in kinds:
-            yield item
-
-
-def collate_by_kind(items):
-    collated = {kind: [] for kind in KIND}
-    for item in items:
-        try:
-            collated[item.kind].append(item)
-        except KeyError:
-            raise ValueError(f'unsupported kind in {item!r}')
-    return collated
-
-
-def get_kind_group(kind):
-    # Handle ParsedItem, Declaration, etc..
-    kind = getattr(kind, 'kind', kind)
-    return KIND.get_group(kind)
-
-
-def collate_by_kind_group(items):
-    collated = {KIND.get_group(k): [] for k in KIND}
-    for item in items:
-        group = KIND.get_group(item.kind)
-        collated[group].append(item)
-    return collated
+def get_kind_group(item):
+    return KIND.get_group(item.kind)
 
 
 #############################
@@ -484,6 +361,27 @@ def get_parsed_vartype(decl):
     return kind, storage, typequal, typespec, abstract
 
 
+def get_default_storage(decl):
+    if decl.kind not in (KIND.VARIABLE, KIND.FUNCTION):
+        return None
+    return 'extern' if decl.parent is None else 'auto'
+
+
+def get_effective_storage(decl, *, default=None):
+    # Note that "static" limits access to just that C module
+    # and "extern" (the default for module-level) allows access
+    # outside the C module.
+    if default is None:
+        default = get_default_storage(decl)
+        if default is None:
+            return None
+    try:
+        storage = decl.storage
+    except AttributeError:
+        storage, _ = _get_vartype(decl.data)
+    return storage or default
+
+
 #############################
 # high-level
 
@@ -997,7 +895,7 @@ def _unformat_data(cls, datastr, fmt=None):
 
     def __init__(self, file, name, data, parent=None, storage=None):
         super().__init__(file, name, data, parent,
-                         _extra={'storage': storage},
+                         _extra={'storage': storage or None},
                          _shortkey=f'({parent.name}).{name}' if parent else name,
                          _key=(str(file),
                                # Tilde comes after all other ascii characters.
@@ -1005,6 +903,11 @@ def __init__(self, file, name, data, parent=None, storage=None):
                                name,
                                ),
                          )
+        if storage:
+            if storage not in STORAGE:
+                # The parser must need an update.
+                raise NotImplementedError(storage)
+            # Otherwise we trust the compiler to have validated it.
 
     @property
     def vartype(self):
@@ -1413,6 +1316,13 @@ def resolve_parsed(parsed):
     return cls.from_parsed(parsed)
 
 
+def set_flag(item, name, value):
+    try:
+        setattr(item, name, value)
+    except AttributeError:
+        object.__setattr__(item, name, value)
+
+
 #############################
 # composite
 
diff --git a/Tools/c-analyzer/c_parser/match.py b/Tools/c-analyzer/c_parser/match.py
new file mode 100644
index 00000000000000..3b5068fd11b685
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/match.py
@@ -0,0 +1,177 @@
+import re
+
+from . import info as _info
+from .parser._regexes import SIMPLE_TYPE
+
+
+_KIND = _info.KIND
+
+
+def match_storage(decl, expected):
+    default = _info.get_default_storage(decl)
+    #assert default
+    if expected is None:
+        expected = {default}
+    elif isinstance(expected, str):
+        expected = {expected or default}
+    elif not expected:
+        expected = _info.STORAGE
+    else:
+        expected = {v or default for v in expected}
+    storage = _info.get_effective_storage(decl, default=default)
+    return storage in expected
+
+
+##################################
+# decl matchers
+
+def is_type_decl(item):
+    return _KIND.is_type_decl(item.kind)
+
+
+def is_decl(item):
+    return _KIND.is_decl(item.kind)
+
+
+def is_pots(typespec, *,
+            _regex=re.compile(rf'^{SIMPLE_TYPE}$', re.VERBOSE),
+            ):
+
+    if not typespec:
+        return None
+    if type(typespec) is not str:
+        _, _, _, typespec, _ = _info.get_parsed_vartype(typespec)
+    return _regex.match(typespec) is not None
+
+
+def is_funcptr(vartype):
+    if not vartype:
+        return None
+    _, _, _, _, abstract = _info.get_parsed_vartype(vartype)
+    return _is_funcptr(abstract)
+
+
+def _is_funcptr(declstr):
+    if not declstr:
+        return None
+    # XXX Support "(<name>*)(".
+    return '(*)(' in declstr.replace(' ', '')
+
+
+def is_forward_decl(decl):
+    if decl.kind is _KIND.TYPEDEF:
+        return False
+    elif is_type_decl(decl):
+        return not decl.data
+    elif decl.kind is _KIND.FUNCTION:
+        # XXX This doesn't work with ParsedItem.
+        return decl.signature.isforward
+    elif decl.kind is _KIND.VARIABLE:
+        # No var decls are considered forward (or all are...).
+        return False
+    else:
+        raise NotImplementedError(decl)
+
+
+def can_have_symbol(decl):
+    return decl.kind in (_KIND.VARIABLE, _KIND.FUNCTION)
+
+
+def has_external_symbol(decl):
+    if not can_have_symbol(decl):
+        return False
+    if _info.get_effective_storage(decl) != 'extern':
+        return False
+    if decl.kind is _KIND.FUNCTION:
+        return not decl.signature.isforward
+    else:
+        # It must be a variable, which can only be implicitly extern here.
+        return decl.storage != 'extern'
+
+
+def has_internal_symbol(decl):
+    if not can_have_symbol(decl):  # only variables and functions qualify
+        return False
+    return _info.get_effective_storage(decl) == 'static'
+
+
+def is_external_reference(decl):
+    if not can_have_symbol(decl):
+        return False
+    # We have to check the declared storage rather than the effective.
+    if decl.storage != 'extern':
+        return False
+    if decl.kind is _KIND.FUNCTION:
+        return decl.signature.isforward
+    # Otherwise it's a variable.
+    return True
+
+
+def is_local_var(decl):
+    if decl.kind is not _KIND.VARIABLE:
+        return False
+    return bool(decl.parent)  # local iff the variable has a parent
+
+
+def is_global_var(decl):
+    if decl.kind is not _KIND.VARIABLE:
+        return False
+    return not decl.parent  # global iff the variable has no parent
+
+
+##################################
+# filtering with matchers
+
+def filter_by_kind(items, kind):
+    if kind == 'type':  # named group: every type-declaration kind
+        kinds = _KIND._TYPE_DECLS
+    elif kind == 'decl':  # named group: every kind KIND.is_decl() accepts
+        kinds = {k for k in _KIND if _KIND.is_decl(k)}
+    else:
+        try:
+            okay = kind in _KIND
+        except TypeError:  # not a KIND member; treat it as a collection of kinds
+            okay = False
+        kinds = {kind} if okay else set(kind)
+    for item in items:
+        if item.kind in kinds:
+            yield item
+
+
+##################################
+# grouping with matchers
+
+def group_by_category(decls, categories, *, ignore_non_match=True):
+    collated = {}
+    for decl in decls:
+        # Matchers should be mutually exclusive.  (First match wins.)
+        for category, match in categories.items():
+            if match(decl):
+                if category not in collated:
+                    collated[category] = [decl]
+                else:
+                    collated[category].append(decl)
+                break
+        else:
+            if not ignore_non_match:
+                raise Exception(f'no match for {decl!r}')
+    return collated
+
+
+def group_by_kind(items):
+    collated = {kind: [] for kind in _KIND}
+    for item in items:
+        try:
+            collated[item.kind].append(item)
+        except KeyError:
+            raise ValueError(f'unsupported kind in {item!r}')
+    return collated
+
+
+def group_by_kinds(items):
+    # Collate into kind groups (decl, type, etc.).
+    collated = {_KIND.get_group(k): [] for k in _KIND}
+    for item in items:
+        group = _KIND.get_group(item.kind)
+        collated[group].append(item)
+    return collated
diff --git a/Tools/c-analyzer/c_parser/parser/__init__.py b/Tools/c-analyzer/c_parser/parser/__init__.py
index 7cb34caf09eba8..4b201c6354023c 100644
--- a/Tools/c-analyzer/c_parser/parser/__init__.py
+++ b/Tools/c-analyzer/c_parser/parser/__init__.py
@@ -163,6 +163,8 @@ def _parse(srclines, anon_name):
 
 
 def _iter_source(lines, *, maxtext=20_000, maxlines=700, showtext=False):
+    maxtext = maxtext if maxtext and maxtext > 0 else None
+    maxlines = maxlines if maxlines and maxlines > 0 else None
     filestack = []
     allinfo = {}
     # "lines" should be (fileinfo, data), as produced by the preprocessor code.
@@ -181,9 +183,7 @@ def _iter_source(lines, *, maxtext=20_000, maxlines=700, showtext=False):
 
         _logger.debug(f'-> {line}')
         srcinfo._add_line(line, fileinfo.lno)
-        if len(srcinfo.text) > maxtext:
-            break
-        if srcinfo.end - srcinfo.start > maxlines:
+        if srcinfo.too_much(maxtext, maxlines):
             break
         while srcinfo._used():
             yield srcinfo
diff --git a/Tools/c-analyzer/c_parser/parser/_info.py b/Tools/c-analyzer/c_parser/parser/_info.py
index 2dcd5e5e760b7c..cc21931b66cc57 100644
--- a/Tools/c-analyzer/c_parser/parser/_info.py
+++ b/Tools/c-analyzer/c_parser/parser/_info.py
@@ -1,3 +1,5 @@
+import re
+
 from ..info import KIND, ParsedItem, FileInfo
 
 
@@ -121,6 +123,19 @@ def resolve(self, kind, data, name, parent=None):
     def done(self):
         self._set_ready()
 
+    def too_much(self, maxtext, maxlines):
+        # A falsy limit (None or 0) turns the corresponding check off.
+        exceeded = maxtext and len(self.text) > maxtext
+        if not exceeded:
+            exceeded = maxlines and self.end - self.start > maxlines
+        if not exceeded:
+            return False
+
+        #if re.fullmatch(r'[^;]+\[\][ ]*=[ ]*[{]([ ]*\d+,)*([ ]*\d+,?)\s*',
+        #                self._current.text):
+        #    return False
+        return True
+
     def _set_ready(self):
         if self._current is None:
             self._ready = False
diff --git a/Tools/c-analyzer/c_parser/parser/_regexes.py b/Tools/c-analyzer/c_parser/parser/_regexes.py
index e9bc31d335a7d5..cb85a59aaa16c2 100644
--- a/Tools/c-analyzer/c_parser/parser/_regexes.py
+++ b/Tools/c-analyzer/c_parser/parser/_regexes.py
@@ -137,7 +137,8 @@ def _ind(text, level=1, edges='both'):
 #######################################
 # variable declarations
 
-STORAGE_CLASS = r'(?: \b (?: auto | register | static | extern ) \b )'
+_STORAGE = 'auto register static extern'.split()
+STORAGE_CLASS = rf'(?: \b (?: {" | ".join(_STORAGE)} ) \b )'
 TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
 PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'
 
diff --git a/Tools/c-analyzer/cpython/__main__.py b/Tools/c-analyzer/cpython/__main__.py
index 23a3de06f639c1..23ce29776ca68e 100644
--- a/Tools/c-analyzer/cpython/__main__.py
+++ b/Tools/c-analyzer/cpython/__main__.py
@@ -31,6 +31,9 @@ def _resolve_filenames(filenames):
     return resolved
 
 
+#######################################
+# the formats
+
 def fmt_summary(analysis):
     # XXX Support sorting and grouping.
     supported = []
@@ -179,7 +182,7 @@ def analyze(files, **kwargs):
                 analyze_resolved=_analyzer.analyze_resolved,
             )
             return _analyzer.Analysis.from_results(results)
-    else:
+    else:  # check
         known = _analyzer.read_known()
         def analyze(files, **kwargs):
             return _analyzer.iter_decls(files, **kwargs)
diff --git a/Tools/c-analyzer/cpython/_analyzer.py b/Tools/c-analyzer/cpython/_analyzer.py
index 98f8888651e579..978831d1fd9496 100644
--- a/Tools/c-analyzer/cpython/_analyzer.py
+++ b/Tools/c-analyzer/cpython/_analyzer.py
@@ -11,9 +11,14 @@
     Struct,
     Member,
     FIXED_TYPE,
+)
+from c_parser.match import (
     is_type_decl,
     is_pots,
     is_funcptr,
+)
+from c_analyzer.match import (
+    is_system_type,
     is_process_global,
     is_fixed_type,
     is_immutable,
@@ -246,7 +251,7 @@ def _check_typespec(decl, typedecl, types, knowntypes):
     # Fall back to default known types.
     if is_pots(typespec):
         return None
-    elif _info.is_system_type(typespec):
+    elif is_system_type(typespec):
         return None
     elif is_funcptr(decl.vartype):
         return None
diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py
index 35fa296251e2ee..7c8c2966653989 100644
--- a/Tools/c-analyzer/cpython/_parser.py
+++ b/Tools/c-analyzer/cpython/_parser.py
@@ -46,10 +46,14 @@ def clean_lines(text):
 GLOBS = [
     'Include/*.h',
     'Include/internal/*.h',
+    'Modules/**/*.h',
     'Modules/**/*.c',
+    'Objects/**/*.h',
     'Objects/**/*.c',
+    'Parser/**/*.h',
     'Parser/**/*.c',
+    'Python/**/*.h',
     'Python/**/*.c',
 ]
 
 EXCLUDED = clean_lines('''
@@ -67,11 +71,24 @@ def clean_lines(text):
 Modules/_winapi.c               # windows.h
 Modules/overlapped.c            # winsock.h
 Python/dynload_win.c            # windows.h
+Modules/expat/winconfig.h
+Python/thread_nt.h
 
 # other OS-dependent
 Python/dynload_dl.c             # dl.h
 Python/dynload_hpux.c           # dl.h
 Python/dynload_aix.c            # sys/ldr.h
+Python/thread_pthread.h
+
+# only huge constants (safe but parsing is slow)
+Modules/_ssl_data.h
+Modules/unicodedata_db.h
+Modules/unicodename_db.h
+Modules/cjkcodecs/mappings_*.h
+Objects/unicodetype_db.h
+Python/importlib.h
+Python/importlib_external.h
+Python/importlib_zipimport.h
 
 # @end=conf@
 ''')
@@ -80,6 +97,17 @@ def clean_lines(text):
 EXCLUDED += clean_lines('''
 # The tool should be able to parse these...
 
+Modules/hashlib.h
+Objects/stringlib/codecs.h
+Objects/stringlib/count.h
+Objects/stringlib/ctype.h
+Objects/stringlib/fastsearch.h
+Objects/stringlib/find.h
+Objects/stringlib/find_max_char.h
+Objects/stringlib/partition.h
+Objects/stringlib/replace.h
+Objects/stringlib/split.h
+
 Modules/_dbmmodule.c
 Modules/cjkcodecs/_codecs_*.c
 Modules/expat/xmlrole.c
@@ -134,6 +162,9 @@ def clean_lines(text):
 Modules/_ctypes/cfield.c	Py_BUILD_CORE	1
 Modules/_heapqmodule.c	Py_BUILD_CORE	1
 Modules/_posixsubprocess.c	Py_BUILD_CORE	1
+Objects/stringlib/codecs.h	Py_BUILD_CORE	1
+Python/ceval_gil.h	Py_BUILD_CORE	1
+Python/condvar.h	Py_BUILD_CORE	1
 
 Modules/_json.c	Py_BUILD_CORE_BUILTIN	1
 Modules/_pickle.c	Py_BUILD_CORE_BUILTIN	1
@@ -177,6 +208,12 @@ def clean_lines(text):
 Python/import.c	PyMODINIT_FUNC	PyObject*
 Modules/_testcapimodule.c	PyAPI_FUNC(RTYPE)	RTYPE
 Python/getargs.c	PyAPI_FUNC(RTYPE)	RTYPE
+Objects/stringlib/unicode_format.h	Py_LOCAL_INLINE(type)	static inline type
+
+# implied include of pymacro.h
+*/clinic/*.c.h	PyDoc_VAR(name)	static const char name[]
+*/clinic/*.c.h	PyDoc_STR(str)	str
+*/clinic/*.c.h	PyDoc_STRVAR(name,str)	PyDoc_VAR(name) = PyDoc_STR(str)
 
 # implied include of exports.h
 #Modules/_io/bytesio.c	Py_EXPORTED_SYMBOL	/* */
@@ -212,6 +249,11 @@ def clean_lines(text):
 Modules/expat/xmlparse.c	XML_POOR_ENTROPY	1
 Modules/_dbmmodule.c	HAVE_GDBM_DASH_NDBM_H	1
 
+# others
+Modules/sre_lib.h	LOCAL(type)	static inline type
+Modules/sre_lib.h	SRE(F)	sre_ucs2_##F
+Objects/stringlib/codecs.h	STRINGLIB_IS_UNICODE	1
+
 # @end=tsv@
 ''')[1:]