diff --git a/Dockerfile b/Dockerfile
index d4c641d7a2f..4a3ff20516e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -38,9 +38,11 @@ WORKDIR /scancode-toolkit
COPY . /scancode-toolkit
# Initial configuration using ./configure, scancode-reindex-licenses to build
-# the base license index
+# the base license index and scancode-reindex-package-patterns to build the
+# package patterns cache
RUN ./configure \
- && ./venv/bin/scancode-reindex-licenses
+ && ./venv/bin/scancode-reindex-licenses \
+ && ./venv/bin/scancode-reindex-package-patterns
# Add scancode to path
ENV PATH=/scancode-toolkit:$PATH
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index b7fb0baac4a..2dc0bc4acc5 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -145,14 +145,6 @@ jobs:
test_suites:
all: venv/bin/pytest -n 2 -vvs tests/scancode/test_cli.py --reruns 2
- - template: etc/ci/azure-posix.yml
- parameters:
- job_name: macos13_cpython
- image_name: macOS-13
- python_versions: ['3.10', '3.11', '3.12', '3.13']
- test_suites:
- all: venv/bin/pytest -n 2 -vvs tests/scancode/test_cli.py --reruns 2
-
- template: etc/ci/azure-win.yml
parameters:
job_name: win2025_cpython
@@ -220,14 +212,6 @@ jobs:
test_suites:
all: venv/bin/pip install --upgrade-strategy eager --force-reinstall --upgrade -e .[testing] && venv/bin/pytest -n 2 -vvs tests/scancode/test_cli.py
- - template: etc/ci/azure-posix.yml
- parameters:
- job_name: macos13_cpython_latest_from_pip
- image_name: macos-13
- python_versions: ['3.10', '3.11', '3.12', '3.13']
- test_suites:
- all: venv/bin/pip install --upgrade-strategy eager --force-reinstall --upgrade -e .[testing] && venv/bin/pytest -n 2 -vvs tests/scancode/test_cli.py
-
- template: etc/ci/azure-win.yml
parameters:
job_name: win2019_cpython_latest_from_pip
diff --git a/docs/source/rst-snippets/cli-basic-options.rst b/docs/source/rst-snippets/cli-basic-options.rst
index ed6777ea9be..58a8d51fca3 100644
--- a/docs/source/rst-snippets/cli-basic-options.rst
+++ b/docs/source/rst-snippets/cli-basic-options.rst
@@ -33,6 +33,12 @@ documenting a program's options. For example:
--system-package Scan ```` for installed system package
databases.
+--package-in-compiled Scan compiled executable binaries such as ELF,
+ WinPE and Mach-O files, looking for structured
+ package and dependency metadata. Note that looking for
+ packages in binaries makes package scan slower.
+ Currently supported compiled binaries: Go, Rust.
+
--package-only Faster package scan, scanning ```` for
system and application packages, only for package
metadata. This option is skipping
diff --git a/etc/release/scancode-create-pypi-wheel.sh b/etc/release/scancode-create-pypi-wheel.sh
index 5ab2fe8e988..4c27868c9cf 100755
--- a/etc/release/scancode-create-pypi-wheel.sh
+++ b/etc/release/scancode-create-pypi-wheel.sh
@@ -19,6 +19,7 @@ set -e
./configure --dev
venv/bin/scancode-reindex-licenses
+venv/bin/scancode-reindex-package-patterns
python_tag=$( python -c "import platform;print(f\"cp{''.join(platform.python_version_tuple()[:2])}\")" )
diff --git a/etc/release/scancode-create-release-app-linux.sh b/etc/release/scancode-create-release-app-linux.sh
index fbe5951a937..93fb37dc0ce 100755
--- a/etc/release/scancode-create-release-app-linux.sh
+++ b/etc/release/scancode-create-release-app-linux.sh
@@ -65,6 +65,7 @@ cp -r etc/thirdparty $release_dir/etc
# Build the wheel
./configure --dev
venv/bin/scancode-reindex-licenses
+venv/bin/scancode-reindex-package-patterns
venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version
cp -r \
diff --git a/etc/release/scancode-create-release-app-macos.sh b/etc/release/scancode-create-release-app-macos.sh
index 5f34bf88f28..7bcd8b7b270 100755
--- a/etc/release/scancode-create-release-app-macos.sh
+++ b/etc/release/scancode-create-release-app-macos.sh
@@ -63,6 +63,7 @@ cp -r etc/thirdparty $release_dir/etc
# Build the wheel
./configure --dev
venv/bin/scancode-reindex-licenses
+venv/bin/scancode-reindex-package-patterns
venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version
cp -r \
diff --git a/etc/release/scancode-create-release-app-windows.sh b/etc/release/scancode-create-release-app-windows.sh
index 03a22d7117a..7a8b8ab87d8 100755
--- a/etc/release/scancode-create-release-app-windows.sh
+++ b/etc/release/scancode-create-release-app-windows.sh
@@ -62,6 +62,7 @@ cp -r etc/thirdparty $release_dir/etc
# Build the wheel
./configure --dev
venv/bin/scancode-reindex-licenses
+venv/bin/scancode-reindex-package-patterns
venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version
cp -r \
diff --git a/requirements.txt b/requirements.txt
index 4cdb3fdebce..e7d5c43e483 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -40,6 +40,7 @@ license-expression==30.4.4
lxml==5.4.0
MarkupSafe==3.0.2
more-itertools==10.7.0
+multiregex==2.0.3
normality==2.6.1
packageurl-python==0.17.1
packaging==25.0
diff --git a/setup-mini.cfg b/setup-mini.cfg
index 2b7196b4a90..7e419713343 100644
--- a/setup-mini.cfg
+++ b/setup-mini.cfg
@@ -89,6 +89,7 @@ install_requires =
license_expression >= 30.4.4
lxml >= 5.4.0
MarkupSafe >= 2.1.2
+ multiregex >= 2.0.3
normality <= 2.6.1
packageurl_python >= 0.9.0
packvers >= 21.0.0
@@ -156,6 +157,7 @@ packages =
console_scripts =
scancode = scancode.cli:scancode
scancode-reindex-licenses = licensedcode.reindex:reindex_licenses
+ scancode-reindex-package-patterns = packagedcode.cache:cache_package_patterns
scancode-license-data = licensedcode.license_db:dump_scancode_license_data
regen-package-docs = packagedcode.regen_package_docs:regen_package_docs
add-required-phrases = licensedcode.required_phrases:add_required_phrases
diff --git a/setup.cfg b/setup.cfg
index c2fa168ed1d..a3c5cc95b8a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -74,6 +74,7 @@ install_requires =
colorama >= 0.3.9
commoncode >= 32.4.0
container-inspector >= 31.0.0
+ cyseq >= 0.0.2
debian-inspector >= 31.1.0
dparse2 >= 0.7.0
fasteners
@@ -90,6 +91,7 @@ install_requires =
license_expression >= 30.4.4
lxml >= 5.4.0
MarkupSafe >= 2.1.2
+ multiregex >= 2.0.3
normality <= 2.6.1
packageurl_python >= 0.9.0
packvers >= 21.0.0
@@ -116,7 +118,6 @@ install_requires =
typecode >= 30.0.1
typecode[full] >= 30.0.1
extractcode[full] >= 31.0.0
- cyseq >= 0.0.2
[options.packages.find]
@@ -158,6 +159,7 @@ packages =
console_scripts =
scancode = scancode.cli:scancode
scancode-reindex-licenses = licensedcode.reindex:reindex_licenses
+ scancode-reindex-package-patterns = packagedcode.cache:cache_package_patterns
scancode-license-data = licensedcode.license_db:dump_scancode_license_data
regen-package-docs = packagedcode.regen_package_docs:regen_package_docs
add-required-phrases = licensedcode.required_phrases:add_required_phrases
diff --git a/src/licensedcode/cache.py b/src/licensedcode/cache.py
index 92998a2bb41..65b0fe59ce1 100644
--- a/src/licensedcode/cache.py
+++ b/src/licensedcode/cache.py
@@ -34,7 +34,6 @@
LICENSE_INDEX_DIR = 'license_index'
LICENSE_INDEX_FILENAME = 'index_cache'
LICENSE_LOCKFILE_NAME = 'scancode_license_index_lockfile'
-LICENSE_CHECKSUM_FILE = 'scancode_license_index_tree_checksums'
class LicenseCache:
diff --git a/src/licensedcode/plugin_license.py b/src/licensedcode/plugin_license.py
index 5c42f96760a..717253c4baa 100644
--- a/src/licensedcode/plugin_license.py
+++ b/src/licensedcode/plugin_license.py
@@ -152,6 +152,9 @@ def setup(self, **kwargs):
This is a cache warmup such that child process inherit from the
loaded index.
"""
+ if kwargs.get("package_only"):
+ return
+
from licensedcode.cache import populate_cache
populate_cache()
diff --git a/src/packagedcode/__init__.py b/src/packagedcode/__init__.py
index 9cc46d0e09b..d3c48b6e259 100644
--- a/src/packagedcode/__init__.py
+++ b/src/packagedcode/__init__.py
@@ -246,15 +246,24 @@
win_reg.InstalledProgramFromDockerUtilityvmSoftwareHandler,
]
+
+# These handlers are special as they use filetype to
+# detect these compiled binaries instead of datafile path patterns
+# as these are optionally installed, we can skip checking
+# for filetype if these are not available
+PACKAGE_IN_COMPILED_DATAFILE_HANDLERS = []
+
try:
from go_inspector.binary import get_go_binary_handler
- APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(get_go_binary_handler())
+ handler = get_go_binary_handler()
+ PACKAGE_IN_COMPILED_DATAFILE_HANDLERS.append(handler)
except ImportError:
pass
try:
from rust_inspector.packages import get_rust_binary_handler
- APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(get_rust_binary_handler())
+ handler = get_rust_binary_handler()
+ PACKAGE_IN_COMPILED_DATAFILE_HANDLERS.append(handler)
except ImportError:
pass
@@ -262,7 +271,7 @@
APPLICATION_PACKAGE_DATAFILE_HANDLERS + [
p for p in SYSTEM_PACKAGE_DATAFILE_HANDLERS
if p not in APPLICATION_PACKAGE_DATAFILE_HANDLERS
- ]
+ ] + PACKAGE_IN_COMPILED_DATAFILE_HANDLERS
)
# registry of all handler classes keyed by datasource_id
diff --git a/src/packagedcode/cache.py b/src/packagedcode/cache.py
new file mode 100644
index 00000000000..52703717e39
--- /dev/null
+++ b/src/packagedcode/cache.py
@@ -0,0 +1,277 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# ScanCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/scancode-toolkit for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import os
+import fnmatch
+import pickle
+import multiregex
+
+import attr
+import click
+
+from collections import defaultdict
+
+from commoncode.fileutils import create_dir
+from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS
+from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS
+
+from scancode_config import packagedcode_cache_dir
+from scancode_config import scancode_cache_dir
+
+"""
+An on-disk persistent cache of package manifest patterns and related package
+manifest handlers mapping. Loading and dumping the cached package manifest
+patterns is safe to use across multiple processes using lock files.
+"""
+
+# global in-memory cache of the PkgManifestPatternsCache
+_PACKAGE_CACHE = None
+
+# This is the Pickle protocol we use, which was added in Python 3.4.
+PICKLE_PROTOCOL = 4
+
+PACKAGE_INDEX_LOCK_TIMEOUT = 60
+PACKAGE_INDEX_DIR = 'package_patterns_index'
+PACKAGE_INDEX_FILENAME = 'index_cache'
+PACKAGE_LOCKFILE_NAME = 'scancode_package_index_lockfile'
+
+
+@attr.s
+class PkgManifestPatternsCache:
+ """
+ Represent cachable package manifest regex patterns, prematchers
+ and mappings from regex patterns to datasource IDs for all datafile
+ handlers.
+ """
+
+ handler_by_regex = attr.ib(default=attr.Factory(dict))
+ system_package_matcher = attr.ib(default=None)
+ application_package_matcher = attr.ib(default=None)
+ all_package_matcher = attr.ib(default=None)
+
+ @staticmethod
+ def all_multiregex_patterns(application_multiregex_patterns, system_multiregex_patterns):
+ return application_multiregex_patterns + [
+ multiregex_pattern
+ for multiregex_pattern in system_multiregex_patterns
+ if multiregex_pattern not in application_multiregex_patterns
+ ]
+
+ @classmethod
+ def load_or_build(
+ cls,
+ packagedcode_cache_dir=packagedcode_cache_dir,
+ scancode_cache_dir=scancode_cache_dir,
+ force=False,
+ timeout=PACKAGE_INDEX_LOCK_TIMEOUT,
+ system_package_datafile_handlers=SYSTEM_PACKAGE_DATAFILE_HANDLERS,
+ application_package_datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS,
+ ):
+ """
+ Load or build and save and return a PkgManifestPatternsCache object.
+
+ We either load a cached PkgManifestPatternsCache or build and cache the patterns.
+
+ - If the cache exists, it is returned unless corrupted.
+ - If ``force`` is True, or if the cache does not exist a new index is built
+ and cached.
+ """
+ idx_cache_dir = os.path.join(packagedcode_cache_dir, PACKAGE_INDEX_DIR)
+ create_dir(idx_cache_dir)
+ cache_file = os.path.join(idx_cache_dir, PACKAGE_INDEX_FILENAME)
+ has_cache = os.path.exists(cache_file) and os.path.getsize(cache_file)
+
+ # bypass build if cache exists
+ if has_cache and not force:
+ try:
+ return load_cache_file(cache_file)
+ except Exception as e:
+ # work around some rare Windows quirks
+ import traceback
+ print('Inconsistent Package cache: rebuilding index.')
+ print(str(e))
+ print(traceback.format_exc())
+
+ from scancode import lockfile
+ lock_file = os.path.join(scancode_cache_dir, PACKAGE_LOCKFILE_NAME)
+
+ # here, we have no cache: lock, check and rebuild
+ try:
+ # acquire lock and wait until timeout to get a lock or die
+ with lockfile.FileLock(lock_file).locked(timeout=timeout):
+
+ system_multiregexes = build_mappings_and_multiregex_patterns(
+ datafile_handlers=system_package_datafile_handlers,
+ )
+ application_multiregexes = build_mappings_and_multiregex_patterns(
+ datafile_handlers=application_package_datafile_handlers,
+ )
+ all_multiregex_matcher = PkgManifestPatternsCache.all_multiregex_patterns(
+ application_multiregex_patterns=application_multiregexes.patterns,
+ system_multiregex_patterns=system_multiregexes.patterns,
+ )
+ system_package_matcher = multiregex.RegexMatcher(system_multiregexes.patterns)
+ application_package_matcher = multiregex.RegexMatcher(application_multiregexes.patterns)
+ all_package_matcher = multiregex.RegexMatcher(all_multiregex_matcher)
+ handler_by_regex = (
+ system_multiregexes.handler_by_regex |
+ application_multiregexes.handler_by_regex
+ )
+ package_cache = cls(
+ handler_by_regex=handler_by_regex,
+ system_package_matcher=system_package_matcher,
+ application_package_matcher=application_package_matcher,
+ all_package_matcher=all_package_matcher,
+ )
+ package_cache.dump(cache_file)
+ return package_cache
+
+ except lockfile.LockTimeout:
+ # TODO: handle unable to lock in a nicer way
+ raise
+
+ def dump(self, cache_file):
+ """
+ Dump this package patterns cache on disk at ``cache_file``.
+ """
+ with open(cache_file, 'wb') as fn:
+ pickle.dump(self, fn, protocol=PICKLE_PROTOCOL)
+
+
+def get_prematchers_from_glob_pattern(pattern):
+ """
+ Get a list of prematchers required to initialize the
+ multiregex matchers for a package manifest pattern.
+
+ Prematchers are words that must be present for a pattern to
+ be matched, and this acts as a pre-matching filter for fast
+ matching.
+ >>> get_prematchers_from_glob_pattern('*pyproject.toml')
+ ['pyproject.toml']
+ """
+ return [
+ prematcher.lower().lstrip("/")
+ for prematcher in pattern.split("*")
+ if prematcher
+ ]
+
+@attr.s
+class AcceleratedPattern():
+ regex :str = attr.ib(default=None) # regular expression string
+ prematchers :list[str] = attr.ib(default=[]) # list of prematcher strings for this regex
+ handler_datasource_ids :list[str] = attr.ib(default=[]) # handler datasource IDs mapped to this regex
+
+
+@attr.s
+class MultiRegexPatternsandMappings:
+ multiregex_patterns :list[AcceleratedPattern] = attr.ib(default=[])
+ handler_by_regex :dict = attr.ib(default={})
+
+ @property
+ def patterns(self):
+ return [
+ (pattern.regex, pattern.prematchers)
+ for pattern in self.multiregex_patterns
+ ]
+
+
+def build_mappings_and_multiregex_patterns(datafile_handlers):
+ """
+ Return a mapping of regex patterns to datafile handler IDs and
+ multiregex patterns consisting of regex patterns and prematchers.
+ """
+ handler_by_regex = defaultdict(list)
+ multiregex_patterns = []
+
+ if not datafile_handlers:
+ return multiregex_patterns, handler_by_regex
+
+ with_patterns = []
+
+ for handler in datafile_handlers:
+ if handler.path_patterns:
+ with_patterns.append(handler)
+
+ prematchers_by_regex = {}
+
+ for handler in with_patterns:
+ for pattern in handler.path_patterns:
+ regex_pattern = fnmatch.translate(pattern)
+ regex_pattern = fr"{regex_pattern}"
+
+ prematchers_by_regex[regex_pattern] = get_prematchers_from_glob_pattern(pattern)
+
+ if regex_pattern in handler_by_regex:
+ handler_by_regex[regex_pattern].append(handler.datasource_id)
+ else:
+ handler_by_regex[regex_pattern]= [handler.datasource_id]
+
+ for regex, handler_ids in handler_by_regex.items():
+ regex_and_prematcher = AcceleratedPattern(
+ regex=regex,
+ prematchers=prematchers_by_regex.get(regex, []),
+ handler_datasource_ids=handler_ids,
+ )
+ multiregex_patterns.append(regex_and_prematcher)
+
+ return MultiRegexPatternsandMappings(
+ handler_by_regex=handler_by_regex,
+ multiregex_patterns=multiregex_patterns,
+ )
+
+
+def get_cache(
+ force=False,
+ packagedcode_cache_dir=packagedcode_cache_dir,
+ scancode_cache_dir=scancode_cache_dir,
+):
+ """
+ Return a PkgManifestPatternsCache either rebuilt, cached or loaded from disk.
+ """
+ global _PACKAGE_CACHE
+
+ if force or not _PACKAGE_CACHE:
+ _PACKAGE_CACHE = PkgManifestPatternsCache.load_or_build(
+ packagedcode_cache_dir=packagedcode_cache_dir,
+ scancode_cache_dir=scancode_cache_dir,
+ force=force,
+ # used for testing only
+ timeout=PACKAGE_INDEX_LOCK_TIMEOUT,
+ )
+ return _PACKAGE_CACHE
+
+
+def load_cache_file(cache_file):
+ """
+ Return a PkgManifestPatternsCache loaded from ``cache_file``.
+ """
+ with open(cache_file, 'rb') as lfc:
+ try:
+ return pickle.load(lfc)
+ except Exception as e:
+ msg = (
+ 'ERROR: Failed to load package cache (the file may be corrupted ?).\n'
+ f'Please delete "{cache_file}" and retry.\n'
+ 'If the problem persists, copy this error message '
+ 'and submit a bug report at https://github.com/nexB/scancode-toolkit/issues/'
+ )
+ raise Exception(msg) from e
+
+
+@click.command(name='scancode-reindex-package-patterns')
+@click.help_option('-h', '--help')
+def cache_package_patterns(*args, **kwargs):
+ """Create scancode package manifest patterns cache and exit"""
+ click.echo('Rebuilding the package cache patterns...')
+ get_cache(force=True)
+ click.echo('Done.')
+
+
+if __name__ == '__main__':
+ cache_package_patterns()
diff --git a/src/packagedcode/data/.gitignore b/src/packagedcode/data/.gitignore
new file mode 100644
index 00000000000..0a2101fab9b
--- /dev/null
+++ b/src/packagedcode/data/.gitignore
@@ -0,0 +1 @@
+/cache/
diff --git a/src/packagedcode/plugin_package.py b/src/packagedcode/plugin_package.py
index 8dc993e3b7a..e887ebc860b 100644
--- a/src/packagedcode/plugin_package.py
+++ b/src/packagedcode/plugin_package.py
@@ -170,6 +170,19 @@ class PackageScanner(ScanPlugin):
help_group=SCAN_GROUP,
sort_order=21,
),
+ PluggableCommandLineOption(
+ (
+ '--package-in-compiled',
+ ),
+ is_flag=True,
+ default=False,
+ help=(
+ 'Scan for package and dependency related data in compiled binaries. '
+ 'Currently supported compiled binaries: Go, Rust.'
+ ),
+ help_group=SCAN_GROUP,
+ sort_order=22,
+ ),
PluggableCommandLineOption(
(
'--package-only',
@@ -182,7 +195,7 @@ class PackageScanner(ScanPlugin):
'license/copyright detection and top-level package creation.'
),
help_group=SCAN_GROUP,
- sort_order=22,
+ sort_order=23,
),
PluggableCommandLineOption(
('--list-packages',),
@@ -195,10 +208,17 @@ class PackageScanner(ScanPlugin):
),
]
- def is_enabled(self, package, system_package, package_only, **kwargs):
- return package or system_package or package_only
+ def is_enabled(self, package, system_package, package_in_compiled, package_only, **kwargs):
+ return package or system_package or package_in_compiled or package_only
- def get_scanner(self, package=True, system_package=False, package_only=False, **kwargs):
+ def get_scanner(
+ self,
+ package=True,
+ system_package=False,
+ package_in_compiled=False,
+ package_only=False,
+ **kwargs
+ ):
"""
Return a scanner callable to scan a file for package data.
"""
@@ -208,6 +228,7 @@ def get_scanner(self, package=True, system_package=False, package_only=False, **
get_package_data,
application=package,
system=system_package,
+ binary=package_in_compiled,
package_only=package_only,
)
@@ -464,7 +485,7 @@ def get_package_and_deps(codebase, package_adder=add_to_package, strip_root=Fals
resource.scan_errors.append(msg)
resource.save(codebase)
- if TRACE:
+ if TRACE_ASSEMBLY:
raise Exception(msg) from e
return packages, dependencies
diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py
index e41d29c82df..7744a550eff 100644
--- a/src/packagedcode/recognize.py
+++ b/src/packagedcode/recognize.py
@@ -11,10 +11,12 @@
import sys
from commoncode import filetype
-from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS
-from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS
-from packagedcode import ALL_DATAFILE_HANDLERS
+from commoncode.fileutils import as_posixpath
+
+from packagedcode import HANDLER_BY_DATASOURCE_ID
+from packagedcode import PACKAGE_IN_COMPILED_DATAFILE_HANDLERS
from packagedcode import models
+from packagedcode.cache import get_cache
TRACE = os.environ.get('SCANCODE_DEBUG_PACKAGE_API', False)
@@ -44,6 +46,7 @@ def recognize_package_data(
location,
application=True,
system=False,
+ compiled=False,
package_only=False,
):
"""
@@ -56,25 +59,21 @@ def recognize_package_data(
if not filetype.is_file(location):
return []
- assert application or system or package_only
- if package_only or (application and system):
- datafile_handlers = ALL_DATAFILE_HANDLERS
- elif application:
- datafile_handlers = APPLICATION_PACKAGE_DATAFILE_HANDLERS
- elif system:
- datafile_handlers = SYSTEM_PACKAGE_DATAFILE_HANDLERS
-
return list(_parse(
location=location,
+ application=application,
+ system=system,
+ compiled=compiled,
package_only=package_only,
- datafile_handlers=datafile_handlers,
))
def _parse(
location,
+ application=True,
+ system=False,
+ compiled=False,
package_only=False,
- datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS,
):
"""
Yield parsed PackageData objects from ``location``. Raises Exceptions on errors.
@@ -83,6 +82,46 @@ def _parse(
Default to use application packages
"""
+ package_path = as_posixpath(location)
+ package_patterns = get_cache()
+
+ has_patterns = application or system or package_only
+ assert has_patterns or compiled
+ if package_only or (application and system):
+ package_matcher = package_patterns.all_package_matcher
+ elif application:
+ package_matcher = package_patterns.application_package_matcher
+ elif system:
+ package_matcher = package_patterns.system_package_matcher
+
+ matched_patterns = []
+ if has_patterns:
+ matched_patterns = package_matcher.match(package_path)
+
+ all_handler_ids = []
+ for matched_pattern in matched_patterns:
+ regex, _match = matched_pattern
+ handler_ids = package_patterns.handler_by_regex.get(regex.pattern)
+ if TRACE:
+ logger_debug(f'_parse:.handler_ids: {handler_ids}')
+
+ all_handler_ids.extend([
+ handler_id
+ for handler_id in handler_ids
+ if handler_id not in all_handler_ids
+ ])
+
+ datafile_handlers = [
+ HANDLER_BY_DATASOURCE_ID.get(handler_id)
+ for handler_id in all_handler_ids
+ ]
+
+ if not datafile_handlers:
+ if compiled:
+ datafile_handlers.extend(PACKAGE_IN_COMPILED_DATAFILE_HANDLERS)
+ elif TRACE:
+ logger_debug(f'_parse: no package datafile detected at {package_path}')
+
for handler in datafile_handlers:
if TRACE:
logger_debug(f'_parse:.is_datafile: {handler}')
diff --git a/src/packagedcode/rubygems.py b/src/packagedcode/rubygems.py
index 9cbbf6d7553..e80295c48af 100644
--- a/src/packagedcode/rubygems.py
+++ b/src/packagedcode/rubygems.py
@@ -211,7 +211,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
# TODO: https://stackoverflow.com/questions/41454333/meaning-of-new-block-git-sourcegithub-in-gemfile
class GemfileHandler(GemspecHandler):
datasource_id = 'gemfile'
- path_patterns = ('*/Gemfile', '*/*.gemfile', '*/Gemfile-*')
+ path_patterns = ('*/Gemfile', '*.gemfile', '*/Gemfile-*')
default_package_type = 'gem'
default_primary_language = 'Ruby'
description = 'RubyGems Bundler Gemfile'
diff --git a/src/scancode/api.py b/src/scancode/api.py
index 94592e20ce1..71382f4a6a0 100644
--- a/src/scancode/api.py
+++ b/src/scancode/api.py
@@ -256,20 +256,28 @@ def get_licenses(
SCANCODE_DEBUG_PACKAGE_API = os.environ.get('SCANCODE_DEBUG_PACKAGE_API', False)
-def _get_package_data(location, application=True, system=False, package_only=False, **kwargs):
+def _get_package_data(
+ location,
+ application=True,
+ system=False,
+ compiled=False,
+ package_only=False,
+ **kwargs
+):
"""
Return a mapping of package manifest information detected in the file at ``location``.
Include ``application`` packages (such as pypi) and/or ``system`` packages.
Note that all exceptions are caught if there are any errors while parsing a
package manifest.
"""
- assert application or system or package_only
+ assert application or system or compiled or package_only
from packagedcode.recognize import recognize_package_data
try:
return recognize_package_data(
location=location,
application=application,
system=system,
+ compiled=compiled,
package_only=package_only,
) or []
@@ -300,7 +308,14 @@ def get_package_info(location, **kwargs):
return dict(packages=[p.to_dict() for p in packages])
-def get_package_data(location, application=True, system=False, package_only=False, **kwargs):
+def get_package_data(
+ location,
+ application=True,
+ system=False,
+ compiled=False,
+ package_only=False,
+ **kwargs
+):
"""
Return a mapping of package manifest information detected in the file at
`location`.
@@ -313,6 +328,7 @@ def get_package_data(location, application=True, system=False, package_only=Fals
location=location,
application=application,
system=system,
+ compiled=compiled,
package_only=package_only,
**kwargs,
) or []
diff --git a/src/scancode_config.py b/src/scancode_config.py
index 9b6e2b7d075..520a0af9396 100644
--- a/src/scancode_config.py
+++ b/src/scancode_config.py
@@ -185,7 +185,13 @@ def _create_dir(location):
__env_license_cache_dir = os.getenv('SCANCODE_LICENSE_INDEX_CACHE')
licensedcode_cache_dir = (__env_license_cache_dir or std_license_cache_dir)
+
+std_package_cache_dir = join(scancode_src_dir, 'packagedcode', 'data', 'cache')
+__env_package_cache_dir = os.getenv('SCANCODE_PACKAGE_INDEX_CACHE')
+packagedcode_cache_dir = (__env_package_cache_dir or std_package_cache_dir)
+
_create_dir(licensedcode_cache_dir)
+_create_dir(packagedcode_cache_dir)
_create_dir(scancode_cache_dir)
# - scancode_temp_dir: for short-lived temporary files which are import- or run-
diff --git a/tests/packagedcode/data/cache/.gitignore b/tests/packagedcode/data/cache/.gitignore
new file mode 100644
index 00000000000..a738fbc8f7f
--- /dev/null
+++ b/tests/packagedcode/data/cache/.gitignore
@@ -0,0 +1 @@
+/package_patterns_index/
\ No newline at end of file
diff --git a/tests/packagedcode/data/plugin/plugins_list_linux.txt b/tests/packagedcode/data/plugin/plugins_list_linux.txt
index e24512dfd91..eb4763d6c7e 100755
--- a/tests/packagedcode/data/plugin/plugins_list_linux.txt
+++ b/tests/packagedcode/data/plugin/plugins_list_linux.txt
@@ -410,7 +410,7 @@ Package type: gem
documentation URL: https://bundler.io/man/gemfile.5.html
primary language: Ruby
description: RubyGems Bundler Gemfile
- path_patterns: '*/Gemfile', '*/*.gemfile', '*/Gemfile-*'
+ path_patterns: '*/Gemfile', '*.gemfile', '*/Gemfile-*'
--------------------------------------------
Package type: gem
datasource_id: gemfile_extracted
diff --git a/tests/packagedcode/test_cache.py b/tests/packagedcode/test_cache.py
new file mode 100644
index 00000000000..27ff079219c
--- /dev/null
+++ b/tests/packagedcode/test_cache.py
@@ -0,0 +1,66 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# ScanCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/scancode-toolkit for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import fnmatch
+import os
+
+from packagedcode import cache
+from commoncode.fileutils import as_posixpath
+
+from packages_test_utils import PackageTester
+
+
+TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
+
+
+class TestMultiregexPatterns(PackageTester):
+ test_data_dir = TEST_DATA_DIR
+
+ def test_build_mappings_and_multiregex_patterns_works(self):
+ from packagedcode.about import AboutFileHandler
+
+ multiregexes = cache.build_mappings_and_multiregex_patterns(
+ datafile_handlers=[AboutFileHandler],
+ )
+ assert multiregexes.patterns == [('(?s:.*\\.ABOUT)\\Z', ['.about'])]
+ assert multiregexes.handler_by_regex == {'(?s:.*\\.ABOUT)\\Z': ['about_file']}
+
+ def test_build_package_cache_works(self):
+ from packagedcode.about import AboutFileHandler
+ from packagedcode.bower import BowerJsonHandler
+
+ package_cache_dir = self.get_test_loc('cache/')
+ package_cache = cache.PkgManifestPatternsCache.load_or_build(
+ packagedcode_cache_dir=package_cache_dir,
+ application_package_datafile_handlers=[AboutFileHandler],
+ system_package_datafile_handlers=[BowerJsonHandler],
+ force=True,
+ )
+ test_path = "scancode-toolkit.ABOUT"
+
+ assert not package_cache.system_package_matcher.match(test_path)
+ assert package_cache.application_package_matcher.match(test_path)
+
+ regex, _match = package_cache.all_package_matcher.match(test_path).pop()
+ assert package_cache.handler_by_regex.get(regex.pattern).pop() == AboutFileHandler.datasource_id
+
+ def test_empty_file_scan_works(self):
+
+ test_file = self.get_test_loc('cache/.gitignore')
+ package_path = as_posixpath(test_file)
+ package_matcher = cache.get_cache()
+
+ assert not package_matcher.all_package_matcher.match(package_path)
+
+ def test_get_prematchers_from_glob_pattern(self):
+
+ from packagedcode.pypi import PyprojectTomlHandler
+
+ prematchers = cache.get_prematchers_from_glob_pattern(PyprojectTomlHandler.path_patterns[0])
+ assert "pyproject.toml" in prematchers
diff --git a/tests/packagedcode/test_cargo.py b/tests/packagedcode/test_cargo.py
index e3309a438e2..2f6a2baa796 100644
--- a/tests/packagedcode/test_cargo.py
+++ b/tests/packagedcode/test_cargo.py
@@ -168,7 +168,7 @@ def test_scan_works_on_rust_binary_with_inspector(self):
test_file = self.get_test_loc('cargo/binary/cargo_dependencies')
expected_file = self.get_test_loc('cargo/binary/cargo-binary.expected.json')
result_file = self.get_temp_file('results.json')
- run_scan_click(['--package', test_file, '--json', result_file])
+ run_scan_click(['--package-in-compiled', test_file, '--json', result_file])
check_json_scan(
expected_file, result_file, remove_uuid=True, regen=REGEN_TEST_FIXTURES
)
diff --git a/tests/packagedcode/test_recognize.py b/tests/packagedcode/test_recognize.py
index f7736aeeb61..98a50164321 100644
--- a/tests/packagedcode/test_recognize.py
+++ b/tests/packagedcode/test_recognize.py
@@ -202,3 +202,8 @@ def test_recognize_rpmdb_sqlite(self):
packages = recognize_package_data(test_file, system=True)
assert packages
assert isinstance(packages[0], models.PackageData)
+
+ def test_recognize_non_package_manifest_file(self):
+ test_file = self.get_test_loc('cache/.gitignore')
+ packages = recognize_package_data(test_file)
+ assert not packages
diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt
index 8a486871b5d..f2ccbb6dc33 100644
--- a/tests/scancode/data/help/help.txt
+++ b/tests/scancode/data/help/help.txt
@@ -8,13 +8,17 @@ Usage: scancode [OPTIONS]