diff --git a/Dockerfile b/Dockerfile
index 4431efc5..64447f8b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -28,6 +28,8 @@ RUN apt-get update \
         libldap-dev \
         openssl \
         wait-for-it \
+        git \
+        cvs \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
diff --git a/etc/nginx/conf.d/default.conf b/etc/nginx/conf.d/default.conf
index c24f2894..dcfa5ec7 100644
--- a/etc/nginx/conf.d/default.conf
+++ b/etc/nginx/conf.d/default.conf
@@ -11,6 +11,7 @@ server {
         proxy_set_header Host $host;
         proxy_redirect off;
         client_max_body_size 10G;
+        client_body_buffer_size 256M;
         proxy_read_timeout 600s;
     }
diff --git a/manage_matchcode.py b/manage_matchcode.py
old mode 100644
new mode 100755
diff --git a/manage_purldb.py b/manage_purldb.py
old mode 100644
new mode 100755
diff --git a/matchcode/api.py b/matchcode/api.py
new file mode 100644
index 00000000..9810645a
--- /dev/null
+++ b/matchcode/api.py
@@ -0,0 +1,308 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+from django.db.models import Q
+from django.forms import widgets
+from django.forms.fields import MultipleChoiceField
+from django_filters.filters import MultipleChoiceFilter
+from django_filters.rest_framework import FilterSet
+from rest_framework.decorators import action
+from rest_framework.response import Response
+from rest_framework.serializers import CharField
+from rest_framework.serializers import FloatField
+from rest_framework.serializers import HyperlinkedRelatedField
+from rest_framework.serializers import ModelSerializer
+from rest_framework.serializers import ReadOnlyField
+from rest_framework.serializers import Serializer
+from rest_framework.viewsets import ReadOnlyModelViewSet
+
+from matchcode_toolkit.fingerprinting import create_halohash_chunks
+from matchcode_toolkit.fingerprinting import hexstring_to_binarray
+from matchcode_toolkit.fingerprinting import split_fingerprint
+from matchcode_toolkit.halohash import byte_hamming_distance
+from matchcode.models import ExactFileIndex
+from matchcode.models import ExactPackageArchiveIndex
+from matchcode.models import ApproximateDirectoryContentIndex
+from matchcode.models import ApproximateDirectoryStructureIndex
+
+
+class BaseFileIndexSerializer(ModelSerializer):
+    sha1 = CharField(source='fingerprint')
+    package = HyperlinkedRelatedField(
+        view_name='api:package-detail',
+        lookup_field='uuid',
+        read_only=True
+    )
+
+
+class ExactFileIndexSerializer(BaseFileIndexSerializer):
+    class Meta:
+        model = ExactFileIndex
+        fields = (
+            'sha1',
+            'package'
+        )
+
+
+class ExactPackageArchiveIndexSerializer(BaseFileIndexSerializer):
+    class Meta:
+        model = ExactPackageArchiveIndex
+        fields = (
+            'sha1',
+            'package'
+        )
+
+
+class BaseDirectoryIndexSerializer(ModelSerializer):
+    fingerprint = ReadOnlyField()
+    package = HyperlinkedRelatedField(
+        view_name='api:package-detail',
+        lookup_field='uuid',
+        read_only=True
+    )
+
+
+class ApproximateDirectoryContentIndexSerializer(BaseDirectoryIndexSerializer):
+    class Meta:
+        model = ApproximateDirectoryContentIndex
+        fields = (
+            'fingerprint',
+            'package',
+        )
+
+
+class ApproximateDirectoryStructureIndexSerializer(BaseDirectoryIndexSerializer):
+    class Meta:
+        model = ApproximateDirectoryStructureIndex
+        fields = (
+            'fingerprint',
+            'package',
+        )
+
+
+class BaseDirectoryIndexMatchSerializer(Serializer):
+    fingerprint = CharField()
+    matched_fingerprint = CharField()
+    package = HyperlinkedRelatedField(
+        view_name='api:package-detail',
+        lookup_field='uuid',
+        read_only=True
+    )
+    similarity_score = FloatField()
+
+
+class CharMultipleWidget(widgets.TextInput):
+    """
+    Enable support for `MultiValueDict` values passed with the `?field=a&field=b`
+    URL syntax, reusing `SelectMultiple.value_from_datadict()` but rendering as a
+    `TextInput`.
+    """
+    def value_from_datadict(self, data, files, name):
+        value = widgets.SelectMultiple().value_from_datadict(data, files, name)
+        if not value or value == ['']:
+            return ''
+
+        return value
+
+    def format_value(self, value):
+        """
+        Return a value as it should appear when rendered in a template.
+        """
+        return ', '.join(value)
+
+
+class MultipleCharField(MultipleChoiceField):
+    """
+    Overrides `MultipleChoiceField` to fit in `MultipleCharFilter`.
+    """
+    widget = CharMultipleWidget
+
+    def valid_value(self, value):
+        return True
+
+
+class MultipleCharFilter(MultipleChoiceFilter):
+    """
+    Filters on multiple values for a CharField type using `?field=a&field=b` URL syntax.
+    """
+    field_class = MultipleCharField
+
+
+# TODO: Think of a better name for this filter
+class MultipleCharInFilter(MultipleCharFilter):
+    def filter(self, qs, value):
+        if not value:
+            # Even though not a noop, no point filtering if empty.
+            return qs
+
+        if self.is_noop(qs, value):
+            return qs
+
+        predicate = self.get_filter_predicate(value)
+        old_field_name = next(iter(predicate))
+        new_field_name = f'{old_field_name}__in'
+        predicate[new_field_name] = predicate[old_field_name]
+        predicate.pop(old_field_name)
+
+        q = Q(**predicate)
+        qs = self.get_method(qs)(q)
+
+        return qs.distinct() if self.distinct else qs
+
+
+class MultipleSHA1Filter(MultipleCharFilter):
+    """
+    Overrides `MultipleCharFilter.filter()` to convert the SHA1
+    into a bytearray so it can be queried.
+    """
+    def filter(self, qs, value):
+        if not value:
+            return qs
+
+        q = Q()
+        for val in value:
+            v = hexstring_to_binarray(val)
+            q.add(Q(sha1=v), Q.OR)
+
+        return qs.filter(q)
+
+
+class MultipleFingerprintFilter(MultipleCharFilter):
+    """
+    Overrides `MultipleCharFilter.filter()` to process a fingerprint from a
+    single string into multiple values used for querying.
+
+    In the BaseDirectoryIndex model, the fingerprint is stored in four chunks of
+    equal size, not as a single field that contains the entire fingerprint. We
+    must process the fingerprint into the correct parts so we can use those
+    parts to query the different fields.
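+
+    For example, a (hypothetical) query value such as
+    ``?fingerprint=00000007af7d63765c78fa516b5353f5ffa7b45a`` is split by
+    ``split_fingerprint()`` into an ``indexed_elements_count`` and a 128-bit
+    halohash, and ``create_halohash_chunks()`` then cuts that halohash into
+    the four ``chunk1`` through ``chunk4`` values queried below.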
+ """ + def filter(self, qs, value): + if not value: + return qs + + q = Q() + for val in value: + indexed_elements_count, bah128 = split_fingerprint(val) + chunk1, chunk2, chunk3, chunk4 = create_halohash_chunks(bah128) + q.add( + Q( + indexed_elements_count=indexed_elements_count, + chunk1=chunk1, + chunk2=chunk2, + chunk3=chunk3, + chunk4=chunk4 + ), + Q.OR + ) + + return qs.filter(q) + + +class BaseFileIndexFilterSet(FilterSet): + sha1 = MultipleSHA1Filter() + + +class ExactFileIndexFilterSet(BaseFileIndexFilterSet): + class Meta: + model = ExactFileIndex + fields = ( + 'sha1', + ) + + +class ExactPackageArchiveFilterSet(BaseFileIndexFilterSet): + class Meta: + model = ExactPackageArchiveIndex + fields = ( + 'sha1', + ) + + +class BaseDirectoryIndexFilterSet(FilterSet): + fingerprint = MultipleFingerprintFilter() + + +class ApproximateDirectoryContentFilterSet(BaseDirectoryIndexFilterSet): + class Meta: + model = ApproximateDirectoryContentIndex + fields = ( + 'fingerprint', + ) + + +class ApproximateDirectoryStructureFilterSet(BaseDirectoryIndexFilterSet): + class Meta: + model = ApproximateDirectoryStructureIndex + fields = ( + 'fingerprint', + ) + + +class BaseFileIndexViewSet(ReadOnlyModelViewSet): + lookup_field = 'sha1' + + +class ExactFileIndexViewSet(BaseFileIndexViewSet): + queryset = ExactFileIndex.objects.all() + serializer_class = ExactFileIndexSerializer + filterset_class = ExactFileIndexFilterSet + + +class ExactPackageArchiveIndexViewSet(BaseFileIndexViewSet): + queryset = ExactPackageArchiveIndex.objects.all() + serializer_class = ExactPackageArchiveIndexSerializer + filterset_class = ExactPackageArchiveFilterSet + + +class BaseDirectoryIndexViewSet(ReadOnlyModelViewSet): + lookup_field = 'fingerprint' + + @action(detail=False) + def match(self, request): + fingerprints = request.query_params.getlist('fingerprint') + if not fingerprints: + return Response() + + model_class = self.get_serializer().Meta.model + results = [] + unique_fingerprints = set(fingerprints) + for fingerprint in unique_fingerprints: + matches = model_class.match(fingerprint) + for match in matches: + _, bah128 = split_fingerprint(fingerprint) + # Get fingerprint from the match + fp = match.fingerprint() + _, match_bah128 = split_fingerprint(fp) + hd = byte_hamming_distance(bah128, match_bah128) + similarity_score = (128 - hd) / 128 + results.append( + { + 'fingerprint': fingerprint, + 'matched_fingerprint': fp, + 'package': match.package, + 'similarity_score': similarity_score, + } + ) + + serialized_match_results = BaseDirectoryIndexMatchSerializer( + results, + context={'request': request}, + many=True + ) + return Response(serialized_match_results.data) + + +class ApproximateDirectoryContentIndexViewSet(BaseDirectoryIndexViewSet): + queryset = ApproximateDirectoryContentIndex.objects.all() + serializer_class = ApproximateDirectoryContentIndexSerializer + filterset_class = ApproximateDirectoryContentFilterSet + + +class ApproximateDirectoryStructureIndexViewSet(BaseDirectoryIndexViewSet): + queryset = ApproximateDirectoryStructureIndex.objects.all() + serializer_class = ApproximateDirectoryStructureIndexSerializer + filterset_class = ApproximateDirectoryStructureFilterSet diff --git a/minecode/api.py b/minecode/api.py index ff534d93..ba5a3b03 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -226,3 +226,12 @@ def update_status(self, request, *args, **kwargs): } return Response(msg) + + @action(detail=False, methods=['get']) + def statistics(self, request, *args, **kwargs): + """ + Return a 
+        """
+        response = ScannableURI.objects.statistics()
+        return Response(response)
+
diff --git a/minecode/indexing.py b/minecode/indexing.py
index f1c238d7..ba6a1e36 100644
--- a/minecode/indexing.py
+++ b/minecode/indexing.py
@@ -114,13 +114,16 @@ def index_package(scannable_uri, package, scan_data, summary_data, project_extra
             'copyright': copyright,
             **checksums_and_size_by_field
         }
+        # do not override fields with empty values
+        values_by_updateable_fields = {k: v for k, v in values_by_updateable_fields.items() if v}
+
        _, updated_fields = package.update_fields(save=True, **values_by_updateable_fields)
         updated_fields = ', '.join(updated_fields)
         message = f'Updated fields for Package {package.purl}: {updated_fields}'
         logger.info(message)
         scannable_uri.scan_status = ScannableURI.SCAN_INDEXED
         scannable_uri.save()
-    except Exception as e:
+    except Exception:
         traceback_message = traceback.format_exc()
         error_message = traceback_message + '\n'
         # TODO: We should rerun the specific indexers that have failed
diff --git a/minecode/model_utils.py b/minecode/model_utils.py
index 4888d3d8..97e875a8 100644
--- a/minecode/model_utils.py
+++ b/minecode/model_utils.py
@@ -44,7 +44,7 @@
 )
 
 
-def add_package_to_scan_queue(package, pipelines=DEFAULT_PIPELINES, reindex_uri=False, priority=0):
+def add_package_to_scan_queue(package, pipelines=DEFAULT_PIPELINES, reindex_uri=False, priority=100):
     """
     Add a Package `package` to the scan queue to run the list of provided `pipelines`
 
@@ -219,19 +219,27 @@ def merge_packages(existing_package, new_package_data, replace=False):
     return updated_fields
 
 
-def merge_or_create_package(scanned_package, visit_level):
+def merge_or_create_package(scanned_package, visit_level, override=False):
     """
-    Update Package from `scanned_package` instance if `visit_level` is greater
+    Update Package from ``scanned_package`` instance if ``visit_level`` is greater
     than the mining level of the existing package.
 
-    If `scanned_package` does not exist in the PackageDB, create a new entry in
-    the PackageDB for `scanned_package`.
+    If ``scanned_package`` does not exist in the PackageDB, create a new entry in
+    the PackageDB for ``scanned_package``.
+
+    If ``override`` is True, all existing empty values of the PackageDB package
+    are replaced by the corresponding non-empty values of the provided package data.
     """
     created = False
     merged = False
     package = None
     map_error = ''
 
+    mining_level = visit_level
+    if override:
+        # this will force the data override
+        visit_level += 1
+
     if not isinstance(scanned_package, PackageData):
         msg = 'Not a ScanCode PackageData type:' + repr(scanned_package)
         map_error += msg + '\n'
@@ -297,7 +305,7 @@
         # package and the existing package, the existing package parties should be
         # deleted first and then the new package's parties added.
-        stored_package.mining_level = visit_level
+        stored_package.mining_level = mining_level
 
         if updated_fields:
             data = {
@@ -335,7 +343,7 @@
             filename=fileutils.file_name(package_uri),
             # TODO: update the PackageDB model
             release_date=scanned_package.release_date,
-            mining_level=visit_level,
+            mining_level=mining_level,
             type=scanned_package.type,
             namespace=scanned_package.namespace,
             name=scanned_package.name,
diff --git a/minecode/models.py b/minecode/models.py
index 50c6c513..206b4c1a 100644
--- a/minecode/models.py
+++ b/minecode/models.py
@@ -27,7 +27,6 @@
 
 from packagedb.models import Package
 
-
 logger = logging.getLogger(__name__)
 logging.basicConfig(stream=sys.stdout)
 logger.setLevel(logging.INFO)
@@ -515,6 +514,7 @@ def save(self, *args, **kwargs):
 
 
 class ScannableURIManager(models.Manager):
+
     def get_scannables(self):
         """
         Return an ordered query set of all scannable ScannableURIs.
@@ -616,6 +616,48 @@ def get_next_processable(self):
         """
         return self.__get_next_candidate(self.get_processables())
 
+    def statistics(self):
+        """
+        Return a statistics mapping with summary counts of ScannableURI grouped by status.
+        """
+        statuses = list(self.values('scan_status').annotate(count=models.Count('scan_status')).order_by('scan_status'))
+        for stat in statuses:
+            stat['scan_status'] = ScannableURI.SCAN_STATUSES_BY_CODE[stat['scan_status']]
+        stats = {
+            'total': self.count(),
+            'processables': self.get_processables().count(),
+            'scannables': self.get_scannables().count(),
+            'by_status': statuses,
+        }
+
+        most_recent = dict(
+            most_recent_submitted=self._recent(scan_status=ScannableURI.SCAN_SUBMITTED),
+            most_recent_indexed=self._recent(scan_status=ScannableURI.SCAN_INDEXED),
+            most_recent_failed=self._recent(scan_status=ScannableURI.SCAN_FAILED, extra_value="scan_error"),
+            most_recent_in_progress=self._recent(scan_status=ScannableURI.SCAN_IN_PROGRESS),
+            most_recent_completed=self._recent(scan_status=ScannableURI.SCAN_COMPLETED),
+            most_recent_index_errors=self._recent(scan_status=ScannableURI.SCAN_INDEX_FAILED, extra_value="index_error"),
+        )
+        stats.update(most_recent)
+        return stats
+
+    def _recent(self, scan_status, extra_value=None, most_recent=10):
+        """
+        Yield mappings of the ``most_recent`` PURL and download URL with a given
+        ``scan_status``.
+        Include an optional ``extra_value`` field name.
+        """
+        recent_uris = self.filter(scan_status=scan_status).order_by('-scan_date')[:most_recent]
+        for scauri in recent_uris:
+            recent = dict(
+                # package_url is NOT a stored field, hence this loop over objects
+                package_url=scauri.package.package_url,
+                download_url=scauri.package.download_url,
+            )
+            if extra_value:
+                recent[extra_value] = getattr(scauri, extra_value)
+            yield recent
+
 
 class ScannableURI(BaseURI):
     """
@@ -794,6 +836,7 @@ def process_scan_results(
 
 
 # TODO: Use the QuerySet.as_manager() for more flexibility and chaining.
 class PriorityResourceURIManager(models.Manager):
+
     def insert(self, uri, **extra_fields):
         """
         Create and return a new PriorityResourceURI after computing its canonical URI
@@ -962,6 +1005,7 @@ def save(self, *args, **kwargs):
 
 
 # TODO: Use the QuerySet.as_manager() for more flexibility and chaining.
 class ImportableURIManager(models.Manager):
+
     def insert(self, uri, data, package_url, **extra_fields):
         """
         Create and return a new ImportableURI
@@ -1027,10 +1071,10 @@ def get_next_request(self):
         importable_uri.save(update_fields=['wip_date'])
         return importable_uri
 
-
 # TODO: have a second queue for crawling maven repo, that tracks which pages and namespaces we visited
 # when we hit the point of a package page, we add it to the queue that creates skinny packages for the package we visited.
+
 class ImportableURI(BaseURI):
     package_url = models.CharField(
         max_length=2048,
diff --git a/minecode/visitors/generic.py b/minecode/visitors/generic.py
index 67ba344a..4eb044dd 100644
--- a/minecode/visitors/generic.py
+++ b/minecode/visitors/generic.py
@@ -15,6 +15,7 @@
 from packageurl import PackageURL
 
 from minecode import priority_router
+from packagedb.models import PackageContentType
 
 """
 Collect generic packages from a download URL.
@@ -28,7 +29,7 @@
 
 def map_generic_package(package_url, pipelines):
     """
-    Add a npm `package_url` to the PackageDB.
+    Add a generic `package_url` to the PackageDB.
 
     Return an error string if any errors are encountered during the process
     """
@@ -44,8 +45,8 @@ def map_generic_package(package_url, pipelines):
         qualifiers=package_url.qualifiers,
         subpath=package_url.subpath,
         download_url=download_url,
+        extra_data=dict(package_content=PackageContentType.BINARY),
     )
-    # TODO: set package_content type
 
     db_package, _, _, error = merge_or_create_package(package, visit_level=0)
 
@@ -158,6 +159,7 @@ def map_fetchcode_supported_package(package_url, pipelines):
     "pkg:generic/erofs-utils@.*",
 ]
 
+
 # Indexing some generic PURLs requires a GitHub API token.
 # Please add your GitHub API key to the `.env` file, for example: `GH_TOKEN=your-github-api`.
 @priority_router.route(*GENERIC_FETCHCODE_SUPPORTED_PURLS)
diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py
index 98cc069f..5d7a1133 100644
--- a/minecode/visitors/maven.py
+++ b/minecode/visitors/maven.py
@@ -64,7 +64,6 @@
 if TRACE:
     logger.setLevel(logging.DEBUG)
 
-
 MAVEN_BASE_URL = 'https://repo1.maven.org/maven2'
@@ -243,11 +242,13 @@ def merge_ancestors(ancestor_pom_texts, package):
     return package
 
 
-def map_maven_package(package_url, package_content, pipelines):
+def map_maven_package(package_url, package_content, pipelines, reindex_metadata=False):
     """
     Add a maven `package_url` to the PackageDB.
 
     Return an error string if errors have occured in the process.
+
+    If ``reindex_metadata`` is True, only reindex metadata and DO NOT rescan the full package.
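+
+    In that case the existing PackageDB record is updated in place (via the new
+    ``override`` flag of ``merge_or_create_package()``) and no scan is queued.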
""" from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package @@ -308,20 +309,22 @@ def map_maven_package(package_url, package_content, pipelines): sha1 = get_package_sha1(package) if sha1: package.sha1 = sha1 - db_package, _, _, _ = merge_or_create_package(package, visit_level=50) + override = reindex_metadata + db_package, _, _, _ = merge_or_create_package(package, visit_level=50, override=override) else: msg = f'Failed to retrieve JAR: {package_url}' error += msg + '\n' logger.error(msg) - - # Submit package for scanning - if db_package: - add_package_to_scan_queue(db_package, pipelines) + + if not reindex_metadata: + # Submit package for scanning + if db_package: + add_package_to_scan_queue(package=db_package, pipelines=pipelines) return db_package, error -def map_maven_binary_and_source(package_url, pipelines): +def map_maven_binary_and_source(package_url, pipelines, reindex_metadata=False): """ Get metadata for the binary and source release of the Maven package `package_url` and save it to the PackageDB. @@ -329,19 +332,27 @@ def map_maven_binary_and_source(package_url, pipelines): Return an error string for errors that occur, or empty string if there is no error. """ error = '' - package, emsg = map_maven_package(package_url, PackageContentType.BINARY, pipelines) + package, emsg = map_maven_package( + package_url=package_url, + package_content=PackageContentType.BINARY, + pipelines=pipelines, + reindex_metadata=reindex_metadata, + ) if emsg: error += emsg source_package_url = package_url source_package_url.qualifiers['classifier'] = 'sources' source_package, emsg = map_maven_package( - source_package_url, PackageContentType.SOURCE_ARCHIVE, pipelines + package_url=source_package_url, + package_content=PackageContentType.SOURCE_ARCHIVE, + pipelines=pipelines, + reindex_metadata=reindex_metadata, ) if emsg: error += emsg - if package and source_package: + if not reindex_metadata and package and source_package: make_relationship( from_package=source_package, to_package=package, @@ -419,10 +430,11 @@ def process_request(purl_str, **kwargs): Return an error string for errors that occur, or empty string if there is no error. """ from minecode.model_utils import DEFAULT_PIPELINES - + addon_pipelines = kwargs.get('addon_pipelines', []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) + try: package_url = PackageURL.from_string(purl_str) except ValueError as e: @@ -431,7 +443,8 @@ def process_request(purl_str, **kwargs): has_version = bool(package_url.version) if has_version: - error = map_maven_binary_and_source(package_url, pipelines) + reindex_metadata=kwargs.get("reindex_metadata", False) + error = map_maven_binary_and_source(package_url, pipelines, reindex_metadata=reindex_metadata) else: error = map_maven_packages(package_url, pipelines) @@ -1103,7 +1116,6 @@ def is_source(classifier): """ return classifier and ('source' in classifier or 'src' in classifier) - ######################################################################## # DOCUMENTAION OF the FIELDS aka. 
 #
@@ -1495,11 +1507,11 @@ def java_time_ts(tm):
     ar = arrow.get(tm / 1000).replace(tzinfo=tzinfo).to('utc')
     return ar.isoformat()
 
-
 ################################################################################
 # These are CLI/shell test and stat utilities
 ################################################################################
 
+
 def _spit_json(location, target):
     with open(target, 'w') as t:
         t.write('[\n')
diff --git a/packagedb/api.py b/packagedb/api.py
index 2c3c01fe..6fa9b6e3 100644
--- a/packagedb/api.py
+++ b/packagedb/api.py
@@ -651,10 +651,11 @@ def create(self, request):
 
 def get_enhanced_package(package):
     """
     Return package data from `package`, where the data has been enhanced by
-    other packages in the same package_set.
+    other packages in the same package set.
     """
     package_content = package.package_content
     in_package_sets = package.package_sets.count() > 0
+
     if (
         not in_package_sets
         or not package_content
@@ -665,21 +666,23 @@ def get_enhanced_package(package):
         # Source repo packages can't really be enhanced much further, datawise
         # and we can't enhance a package that is not in a package set.
         return package.to_dict()
-    if package_content in [PackageContentType.BINARY, PackageContentType.SOURCE_ARCHIVE]:
+
+    elif package_content in [PackageContentType.BINARY, PackageContentType.SOURCE_ARCHIVE]:
         # Binary packages can only be part of one set
         # TODO: Can source_archive packages be part of multiple sets?
-        package_set = package.package_sets.first()
-        if package_set:
-            package_set_members = package_set.get_package_set_members()
+        first_package_in_set = package.package_sets.first()
+        if first_package_in_set:
+            package_set_members = first_package_in_set.get_package_set_members()
             if package_content == PackageContentType.SOURCE_ARCHIVE:
                 # Mix data from SOURCE_REPO packages for SOURCE_ARCHIVE packages
                 package_set_members = package_set_members.filter(
                     package_content=PackageContentType.SOURCE_REPO
                 )
             # TODO: consider putting in the history field that we enhanced the data
-            return _get_enhanced_package(package, package_set_members)
-        else:
-            return package.to_dict()
+            return _get_enhanced_package(package=package, packages=package_set_members)
+        else:
+            # if not enhanced return the package as-is
+            return package.to_dict()
 
 
 def _get_enhanced_package(package, packages):
@@ -688,10 +691,20 @@
     `packages`.
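+    Only peers with a lower ``package_content`` value than ``package`` (for
+    example, a source repo peer enhancing a binary) contribute, and only to
+    fill fields that are empty on ``package``.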
""" package_data = package.to_dict() + + # always default to PackageContentType.BINARY as we can have None/NULL in the model for now + # Reference: https://github.com/nexB/purldb/issues/490 + package_content = (package and package.package_content) or PackageContentType.BINARY + for peer in packages: - if peer.package_content >= package.package_content: + # always default to PackageContentType.BINARY as we can have None/NULL in the model for now + # Reference: https://github.com/nexB/purldb/issues/490 + peer_content = (peer and peer.package_content) or PackageContentType.BINARY + + if peer_content >= package_content: # We do not want to mix data with peers of the same package content continue + enhanced = False for field in UPDATEABLE_FIELDS: package_value = package_data.get(field) @@ -709,6 +722,7 @@ def _get_enhanced_package(package, packages): enhanced_by.append(peer.purl) extra_data['enhanced_by'] = enhanced_by package_data['extra_data'] = extra_data + return package_data @@ -1016,6 +1030,73 @@ def _reindex_package(package, reindexed_packages, **kwargs): serializer = IndexPackagesResponseSerializer(response_data, context={'request': request}) return Response(serializer.data) + @extend_schema( + parameters=[ + OpenApiParameter('purl', str, 'query', description='PackageURL', required=True), + ], + responses={200:PackageAPISerializer()}, + ) + @action(detail=False, methods=['get'], serializer_class=CollectPackageSerializer) + def reindex_metadata(self, request, *args, **kwargs): + """ + Collect or recollect the package metadata of a ``PURL`` string. + Also recollects all packages in the set of the PURL. + + If the PURL does exist, calling thios endpoint with re-collect, re-store and return the + Package metadata immediately, + + If the package does not exist in the database this call does nothing. + NOTE: this WILL NOT re-run scan and indexing in the background in contrast with the /collect + and collect/index_packages endpoints. 
+
+        **Request example**::
+
+            /api/collect/reindex_metadata/?purl=pkg:npm/foo@0.0.7
+        """
+        serializer = self.serializer_class(data=request.query_params)
+        if not serializer.is_valid():
+            return Response(
+                {'errors': serializer.errors},
+                status=status.HTTP_400_BAD_REQUEST,
+            )
+
+        validated_data = serializer.validated_data
+        purl = validated_data.get('purl')
+
+        lookups = purl_to_lookups(purl)
+        packages = Package.objects.filter(**lookups)
+        if packages.count() == 0:
+            return Response(
+                {'status': f'Not recollecting: Package does not exist for {purl}'},
+                status=status.HTTP_400_BAD_REQUEST,
+            )
+
+        # pass reindex_metadata=True downstream so that handlers only recollect metadata
+        kwargs["reindex_metadata"] = True
+        # here we have one or more packages matching our purl and we want to recollect their metadata live
+        try:
+            errors = priority_router.process(purl, **kwargs)
+        except NoRouteAvailable:
+            message = {
+                'status': f'cannot fetch Package data for {purl}: no available handler'
+            }
+            return Response(message, status=status.HTTP_400_BAD_REQUEST)
+
+        lookups = purl_to_lookups(purl)
+        packages = Package.objects.filter(**lookups)
+        if packages.count() == 0:
+            message = {}
+            if errors:
+                message = {
+                    'status': f'error(s) occurred when fetching metadata for {purl}: {errors}'
+                }
+            return Response(message)
+
+        serializer = PackageAPISerializer(packages, many=True, context={'request': request})
+        return Response(serializer.data)
+
+
 class PurlValidateViewSet(viewsets.ViewSet):
     """
diff --git a/packagedb/models.py b/packagedb/models.py
index 139976db..dd0ad36a 100644
--- a/packagedb/models.py
+++ b/packagedb/models.py
@@ -1211,13 +1211,17 @@ def __str__(self):
 
 
 def make_relationship(
-    from_package, to_package, relationship
+    from_package, to_package, relationship,
 ):
-    return PackageRelation.objects.create(
+    """
+    Get or create and return the from/to package relationship.
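+    Using ``get_or_create`` keeps this idempotent: repeated calls with the same
+    arguments return the existing relation instead of creating a duplicate.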
+ """ + pkg, _created = PackageRelation.objects.get_or_create( from_package=from_package, to_package=to_package, relationship=relationship, ) + return pkg class PackageWatch(models.Model): diff --git a/packagedb/tests/test_api.py b/packagedb/tests/test_api.py index a2983858..f8f384ca 100644 --- a/packagedb/tests/test_api.py +++ b/packagedb/tests/test_api.py @@ -269,7 +269,7 @@ def setUp(self): self.package3 = Package.objects.create(**self.package_data3) self.package3.refresh_from_db() - self.package_data4= { + self.package_data4 = { 'type': 'jar', 'namespace': 'sample', 'name': 'Baz', @@ -286,7 +286,7 @@ def setUp(self): self.package4 = Package.objects.create(**self.package_data4) self.package4.refresh_from_db() - self.package_data5= { + self.package_data5 = { 'type': 'maven', 'namespace': 'foot', 'name': 'baz', @@ -305,7 +305,7 @@ def setUp(self): self.package5 = Package.objects.create(**self.package_data5) self.package5.refresh_from_db() - self.package_data6= { + self.package_data6 = { 'type': 'maven', 'namespace': 'fooo', 'name': 'baz', @@ -323,7 +323,7 @@ def setUp(self): self.package6 = Package.objects.create(**self.package_data6) self.package6.refresh_from_db() - self.package_data7= { + self.package_data7 = { 'type': 'github', 'namespace': 'glue', 'name': 'cat', @@ -463,7 +463,6 @@ def test_api_package_list_endpoint_multiple_char_filters(self): self.assertIn(self.package3.purl, purls) self.assertNotIn(self.package.purl, purls) - def test_package_api_filter_by_checksums(self): sha1s = [ 'testsha1', @@ -814,6 +813,33 @@ def test_package_live(self): self.check_expected_results(result, expected, fields_to_remove=fields_to_remove, regen=FIXTURES_REGEN) + def test_package_live_works_with_purl2vcs(self): + purl = "pkg:maven/org.elasticsearch.plugin/elasticsearch-scripting-painless-spi@6.8.15" + download_url = 'https://repo1.maven.org/maven2/org/elasticsearch/plugin/elasticsearch-scripting-painless-spi/6.8.15/elasticsearch-scripting-painless-spi-6.8.15.jar' + purl_sources_str = f'{purl}?classifier=sources' + sources_download_url = 'https://repo1.maven.org/maven2/org/elasticsearch/plugin/elasticsearch-scripting-painless-spi/6.8.15/elasticsearch-scripting-painless-spi-6.8.15-sources.jar' + + self.assertEqual(0, Package.objects.filter(download_url=download_url).count()) + self.assertEqual(0, Package.objects.filter(download_url=sources_download_url).count()) + response = self.client.get(f'/api/collect/?purl={purl}') + self.assertEqual(1, Package.objects.filter(download_url=download_url).count()) + self.assertEqual(1, Package.objects.filter(download_url=sources_download_url).count()) + expected = self.get_test_loc('api/elasticsearch-scripting-painless-spi-6.8.15.json') + + self.assertEqual(2, len(response.data)) + result = response.data[0] + + # remove fields + result.pop('url') + fields_to_remove = [ + 'uuid', + 'resources', + 'package_sets', + 'history' + ] + + self.check_expected_results(result, expected, fields_to_remove=fields_to_remove, regen=FIXTURES_REGEN) + def test_package_api_index_packages_endpoint(self): priority_resource_uris_count = PriorityResourceURI.objects.all().count() self.assertEqual(0, priority_resource_uris_count) @@ -1131,7 +1157,7 @@ def setUp(self): 'sha1': 'testsha1', 'md5': 'testmd5', 'size': 101, - 'package_content' : 1 + 'package_content': 1 } self.package = Package.objects.create(**self.package_data) self.package.refresh_from_db() @@ -1157,7 +1183,7 @@ def test_api_purl_updation_existing_package(self): "purls": [ {"purl": "pkg:npm/foobar@1.1.0", "content_type": 
"PATCH"} ], - "uuid" : str(self.new_package_set_uuid) + "uuid": str(self.new_package_set_uuid) } expected = [{"purl": "pkg:npm/foobar@1.1.0", "update_status": "Already Exists"}] @@ -1171,7 +1197,7 @@ def test_api_purl_updation_non_existing_uuid(self): "purls": [ {"purl": "pkg:npm/foobar@1.1.0", "content_type": "SOURCE_REPO"} ], - "uuid" : "ac9c36f4-a1ed-4824-8448-c6ed8f1da71d" + "uuid": "ac9c36f4-a1ed-4824-8448-c6ed8f1da71d" } expected = {"update_status": "No Package Set found for ac9c36f4-a1ed-4824-8448-c6ed8f1da71d"} @@ -1193,7 +1219,6 @@ def test_api_purl_updation_without_uuid(self): self.assertEqual(expected, response.data) - def test_api_purl_validation_empty_request(self): data = {} response = self.client.post(f"/api/update_packages/", data=data, content_type="application/json") @@ -1260,7 +1285,6 @@ def test_api_purl_validation_unsupported_package_type(self): } response1 = self.client.get(f"/api/validate/", data=data1) - self.assertEqual(True, response1.data["valid"]) self.assertEqual( "The provided PackageURL is valid, but `check_existence` is not supported for this package type.", response1.data["message"] diff --git a/packagedb/tests/testfiles/api/elasticsearch-scripting-painless-spi-6.8.15.json b/packagedb/tests/testfiles/api/elasticsearch-scripting-painless-spi-6.8.15.json new file mode 100644 index 00000000..1d551c0d --- /dev/null +++ b/packagedb/tests/testfiles/api/elasticsearch-scripting-painless-spi-6.8.15.json @@ -0,0 +1,87 @@ +{ + "filename":"elasticsearch-scripting-painless-spi-6.8.15.jar", + "package_content":"binary", + "purl":"pkg:maven/org.elasticsearch.plugin/elasticsearch-scripting-painless-spi@6.8.15", + "type":"maven", + "namespace":"org.elasticsearch.plugin", + "name":"elasticsearch-scripting-painless-spi", + "version":"6.8.15", + "qualifiers":"", + "subpath":"", + "primary_language":"Java", + "description":"spi\nElasticsearch subproject :modules:lang-painless:spi", + "release_date":null, + "parties":[ + { + "type":"person", + "role":"developer", + "name":"Elastic", + "email":null, + "url":"http://www.elastic.co" + } + ], + "keywords":[], + "homepage_url":"https://github.com/elastic/elasticsearch", + "download_url":"https://repo1.maven.org/maven2/org/elasticsearch/plugin/elasticsearch-scripting-painless-spi/6.8.15/elasticsearch-scripting-painless-spi-6.8.15.jar", + "bug_tracking_url":null, + "code_view_url":"https://github.com/elastic/elasticsearch.git", + "vcs_url":"https://github.com/elastic/elasticsearch.git", + "repository_homepage_url":null, + "repository_download_url":null, + "api_data_url":null, + "size":null, + "md5":null, + "sha1":"f4d81f7e8a9729f8dfb84d988c904bda9765cb56", + "sha256":null, + "sha512":null, + "copyright":null, + "holder":null, + "declared_license_expression":"apache-2.0", + "declared_license_expression_spdx":"Apache-2.0", + "license_detections":[ + { + "matches":[ + { + "score":100.0, + "matcher":"1-hash", + "end_line":2, + "rule_url":"https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/apache-2.0_40.RULE", + "from_file":null, + "start_line":1, + "matched_text":"- name: The Apache Software License, Version 2.0\n url: http://www.apache.org/licenses/LICENSE-2.0.txt", + "match_coverage":100.0, + "matched_length":18, + "rule_relevance":100, + "rule_identifier":"apache-2.0_40.RULE", + "license_expression":"apache-2.0", + "spdx_license_expression":"Apache-2.0" + } + ], + "identifier":"apache_2_0-bfa9e97a-62d3-0076-c881-8443e5e95192", + "license_expression":"apache-2.0", + "license_expression_spdx":"Apache-2.0" + 
} + ], + "other_license_expression":null, + "other_license_expression_spdx":null, + "other_license_detections":[], + "extracted_license_statement":"- name: The Apache Software License, Version 2.0\n url: http://www.apache.org/licenses/LICENSE-2.0.txt\n", + "notice_text":null, + "source_packages":[ + "pkg:maven/org.elasticsearch.plugin/elasticsearch-scripting-painless-spi@6.8.15?classifier=sources" + ], + "extra_data":{}, + "package_uid":"pkg:maven/org.elasticsearch.plugin/elasticsearch-scripting-painless-spi@6.8.15?uuid=fixed-uid-done-for-testing-5642512d1758", + "datasource_id":null, + "file_references":[], + "dependencies":[ + { + "purl":"pkg:maven/org.elasticsearch/elasticsearch@6.8.15", + "extracted_requirement":"6.8.15", + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":true + } + ] +} \ No newline at end of file diff --git a/purl2vcs/CHANGELOG.rst b/purl2vcs/CHANGELOG.rst index c5a54f7a..56dc7c65 100644 --- a/purl2vcs/CHANGELOG.rst +++ b/purl2vcs/CHANGELOG.rst @@ -1,6 +1,13 @@ Changelog ========= +v1.0.2 +------ + +Fix bug when collecting versions. +Reference: https://github.com/nexB/purldb/issues/486 + + v1.0.1 ------ diff --git a/purl2vcs/pyproject.toml b/purl2vcs/pyproject.toml index 4c459daa..f7e4516f 100644 --- a/purl2vcs/pyproject.toml +++ b/purl2vcs/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "flot.buildapi" [project] name = "purl2vcs" # keep version same as purldb for now -version = "1.0.1" +version = "1.0.2" description = "purl2vcs is an add-on library working with the PurlDB to find the version control system (VCS) URL of a package and detect the commit, tags and path for a given version." readme = "README.rst" license = { text = "Apache-2.0" } diff --git a/purl2vcs/src/purl2vcs/find_source_repo.py b/purl2vcs/src/purl2vcs/find_source_repo.py index b662a21a..34db4861 100644 --- a/purl2vcs/src/purl2vcs/find_source_repo.py +++ b/purl2vcs/src/purl2vcs/find_source_repo.py @@ -168,7 +168,7 @@ def get_source_package_and_add_to_package_set(package): logger.info(f"Created source repo package {source_purl} for {package.purl}") package_set_uuids = [item["uuid"] for item in package.package_sets.all().values("uuid")] package_set_ids = set(package_set_uuids) - source_package_set_ids = set(source_package.package_sets.all().values("uuid")) + source_package_set_ids = set(source_package.package_sets.all().values_list("uuid")) # If the package exists and already in the set then there is nothing left to do if package_set_ids.intersection(source_package_set_ids): diff --git a/purldb_project/urls.py b/purldb_project/urls.py index 90b7d04e..e60e692c 100644 --- a/purldb_project/urls.py +++ b/purldb_project/urls.py @@ -11,22 +11,22 @@ from django.urls import path from django.views.generic import RedirectView from django.views.generic.base import TemplateView - +from drf_spectacular.views import SpectacularAPIView +from drf_spectacular.views import SpectacularSwaggerView from rest_framework import routers -from packagedb.api import PackageViewSet +from matchcode.api import ApproximateDirectoryContentIndexViewSet +from matchcode.api import ApproximateDirectoryStructureIndexViewSet +from minecode.api import ScannableURIViewSet +from packagedb.api import CollectViewSet from packagedb.api import PackageSetViewSet from packagedb.api import PackageUpdateSet +from packagedb.api import PackageViewSet from packagedb.api import PackageWatchViewSet -from packagedb.api import ResourceViewSet -from minecode.api import ScannableURIViewSet from packagedb.api import 
-from packagedb.to_purl import api_to_purl_router
+from packagedb.api import ResourceViewSet
 from packagedb.from_purl import api_from_purl_router
-from packagedb.api import CollectViewSet
-from drf_spectacular.views import SpectacularAPIView
-from drf_spectacular.views import SpectacularSwaggerView
-
+
+from packagedb.to_purl import api_to_purl_router
 
 api_router = routers.DefaultRouter()
 api_router.register('packages', PackageViewSet)
@@ -37,6 +37,9 @@
 api_router.register('collect', CollectViewSet, 'collect')
 api_router.register('watch',PackageWatchViewSet)
 api_router.register('scan_queue', ScannableURIViewSet)
+api_router.register('approximate_directory_content_index', ApproximateDirectoryContentIndexViewSet)
+api_router.register('approximate_directory_structure_index', ApproximateDirectoryStructureIndexViewSet)
+
 
 urlpatterns = [
     path(
diff --git a/setup.cfg b/setup.cfg
index 7d10e896..184a0f40 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -60,7 +60,7 @@ install_requires =
     scancode-toolkit[packages] @ git+https://github.com/nexB/scancode-toolkit.git@684360f2ca01bc676368bc8621eed65065bf0f11
     urlpy == 0.5
    matchcode-toolkit == 5.1.0
-    purl2vcs == 1.0.1
+    purl2vcs == 1.0.2
     univers == 30.11.0
     scancodeio @ git+https://github.com/nexB/scancode.io.git@07b48c0224f5c2ad1b2972b693702ef685f16c98
 setup_requires = setuptools_scm[toml] >= 4
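
Usage sketch for the endpoints added in this diff. This is a hypothetical
example, not part of the change itself: it assumes a purldb instance reachable
at http://localhost and uses the `requests` library. The endpoint paths follow
from the router registrations in purldb_project/urls.py and the @action
methods above; the fingerprint value is made up.

    import requests

    BASE = 'http://localhost'

    # summary counts of the scan queue, grouped by scan status
    # (ScannableURIViewSet.statistics)
    print(requests.get(f'{BASE}/api/scan_queue/statistics/').json())

    # recollect and store metadata only, without queueing a rescan
    # (CollectViewSet.reindex_metadata)
    print(requests.get(
        f'{BASE}/api/collect/reindex_metadata/',
        params={'purl': 'pkg:npm/foo@0.0.7'},
    ).json())

    # approximate directory-content matching for one or more fingerprints
    # (BaseDirectoryIndexViewSet.match)
    print(requests.get(
        f'{BASE}/api/approximate_directory_content_index/match/',
        params={'fingerprint': '00000007af7d63765c78fa516b5353f5ffa7b45a'},
    ).json())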