Simplify and tighten license and documentation file name matching

[\.] is the same as [.], and [-] matches the dash, so there is no need for backslash-escaping.
Also, let's shorten the patterns by using alternatives for the common parts.

Before, we would accept a match on any prefix of the file name, even though the
patterns were clearly intended to cover the whole file name. Let's use fullmatch
instead of match to make it clear that the whole string must be matched.
This commit is contained in:
Zbigniew Jędrzejewski-Szmek 2021-12-11 12:46:27 +01:00
parent 8e8ae0c969
commit e9a84e8261

View file

@ -37,15 +37,14 @@ JINJA_ENV = jinja2.Environment(
extensions=["jinja2.ext.do"], extensions=["jinja2.ext.do"],
trim_blocks=True, trim_blocks=True,
lstrip_blocks=True) lstrip_blocks=True)
LICENSES = re.compile( LICENSES = re.compile(r"""
r"(COPYING|COPYING[\.\-].*|COPYRIGHT|COPYRIGHT[\.\-].*|" COPYING(?:[.-].*)?|COPYRIGHT(?:[.-].*)?|
r"EULA|EULA[\.\-].*|[Ll]icen[cs]e|[Ll]icen[cs]e.*|LICEN[CS]E|" EULA(?:[.-].*)?|[Ll]icen[cs]e|[Ll]icen[cs]e.*|
r"LICEN[CS]E[\.\-].*|.*[\.\-]LICEN[CS]E.*|NOTICE|NOTICE[\.\-].*|" (?:.*[.-])?(?:UN)?LICEN[CS]E(?:[.-].*)?|NOTICE(?:[.-].*)?|
r"PATENTS|PATENTS[\.\-].*|UNLICEN[CS]E|UNLICEN[CS]E[\.\-].*|" PATENTS(?:[.-].*)?|
r"agpl[\.\-].*|gpl[\.\-].*|lgpl[\.\-].*|AGPL-.*[0-9].*|" (?:agpl|l?gpl)[.-].*|CC-BY-.*|
r"APACHE-.*[0-9].*|BSD-.*[0-9].*|CC-BY-.*|GFDL-.*[0-9].*|" (?:AGPL|APACHE|BSD|GFDL|GNU|L?GPL|MIT|MPL|OFL)-.*[0-9].*
r"GNU-.*[0-9].*|GPL-.*[0-9].*|LGPL-.*[0-9].*|MIT-.*[0-9].*|" """, re.VERBOSE)
r"MPL-.*[0-9].*|OFL-.*[0-9].*)")
def sortify(func): def sortify(func):
"""Return a sorted list from a generator""" """Return a sorted list from a generator"""
@ -236,16 +235,18 @@ def get_license_files(path):
@sortify @sortify
def get_doc_files(path): def get_doc_files(path):
matcher = re.compile( plus = re.compile(r"""
r"(.*\.md|.*\.markdown|.*\.mdown|.*\.mkdn|.*\.rst|.*\.txt|AUTHORS|" .*\.(?:md|markdown|mdown|mkdn|rst|txt)|AUTHORS|
r"AUTHORS[\.\-].*|CONTRIBUTORS|CONTRIBUTORS[\.\-].*|README|" AUTHORS[.-].*|CONTRIBUTORS|CONTRIBUTORS[.-].*|README|
r"README[\.\-].*|CHANGELOG|CHANGELOG[\.\-].*|TODO|TODO[\.\-].*)", README[.-].*|CHANGELOG|CHANGELOG[.-].*|TODO|TODO[.-].*
re.IGNORECASE) """,
matcherex = re.compile(r"CMakeLists\.txt") re.IGNORECASE | re.VERBOSE)
minus = re.compile(r"CMakeLists\.txt")
for root, dirs, files in os.walk(path, topdown=True): for root, dirs, files in os.walk(path, topdown=True):
dirs[:] = [] dirs[:] = []
for f in files: for f in files:
if matcher.match(f) and not LICENSES.match(f) and not matcherex.match(f): if plus.fullmatch(f) and not LICENSES.fullmatch(f) and not minus.fullmatch(f):
yield os.path.relpath(os.path.join(root, f), path) yield os.path.relpath(os.path.join(root, f), path)
def get_package_info(package): def get_package_info(package):