Skip to content

Commit

Permalink
Merge pull request #487 from nexB/increase-scannableuri-priority
Browse files Browse the repository at this point in the history
Improve ondemand data collection for npm and maven
  • Loading branch information
JonoYang authored Jul 16, 2024
2 parents 6498c87 + fc4a1cc commit 275d6da
Show file tree
Hide file tree
Showing 20 changed files with 655 additions and 60 deletions.
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ RUN apt-get update \
libldap-dev \
openssl \
wait-for-it \
git \
cvs \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

Expand Down
1 change: 1 addition & 0 deletions etc/nginx/conf.d/default.conf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ server {
proxy_set_header Host $host;
proxy_redirect off;
client_max_body_size 10G;
client_body_buffer_size 256M;
proxy_read_timeout 600s;
}

Expand Down
Empty file modified manage_matchcode.py
100644 → 100755
Empty file.
Empty file modified manage_purldb.py
100644 → 100755
Empty file.
308 changes: 308 additions & 0 deletions matchcode/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,308 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
from django.db.models import Q
from django.forms import widgets
from django.forms.fields import MultipleChoiceField
from django_filters.filters import MultipleChoiceFilter
from django_filters.rest_framework import FilterSet
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.serializers import CharField
from rest_framework.serializers import FloatField
from rest_framework.serializers import HyperlinkedRelatedField
from rest_framework.serializers import ModelSerializer
from rest_framework.serializers import ReadOnlyField
from rest_framework.serializers import Serializer
from rest_framework.viewsets import ReadOnlyModelViewSet

from matchcode_toolkit.fingerprinting import create_halohash_chunks
from matchcode_toolkit.fingerprinting import hexstring_to_binarray
from matchcode_toolkit.fingerprinting import split_fingerprint
from matchcode_toolkit.halohash import byte_hamming_distance
from matchcode.models import ExactFileIndex
from matchcode.models import ExactPackageArchiveIndex
from matchcode.models import ApproximateDirectoryContentIndex
from matchcode.models import ApproximateDirectoryStructureIndex


class BaseFileIndexSerializer(ModelSerializer):
    """
    Field declarations shared by the file index serializers.

    `sha1` is read from the instance's `fingerprint` attribute and `package`
    is rendered as a read-only hyperlink resolved through the
    `api:package-detail` route using the package `uuid`.
    """
    sha1 = CharField(source='fingerprint')
    package = HyperlinkedRelatedField(
        lookup_field='uuid',
        read_only=True,
        view_name='api:package-detail',
    )


class ExactFileIndexSerializer(BaseFileIndexSerializer):
    """Serialize `ExactFileIndex` entries as a `sha1` and a `package` link."""

    class Meta:
        model = ExactFileIndex
        fields = ('sha1', 'package')


class ExactPackageArchiveIndexSerializer(BaseFileIndexSerializer):
    """
    Serialize `ExactPackageArchiveIndex` entries as a `sha1` and a
    `package` link.
    """

    class Meta:
        model = ExactPackageArchiveIndex
        fields = ('sha1', 'package')


class BaseDirectoryIndexSerializer(ModelSerializer):
    """
    Field declarations shared by the directory index serializers.

    `fingerprint` is exposed read-only and `package` is rendered as a
    read-only hyperlink resolved through the `api:package-detail` route
    using the package `uuid`.
    """
    fingerprint = ReadOnlyField()
    package = HyperlinkedRelatedField(
        lookup_field='uuid',
        read_only=True,
        view_name='api:package-detail',
    )

class ApproximateDirectoryContentIndexSerializer(BaseDirectoryIndexSerializer):
    """
    Serialize `ApproximateDirectoryContentIndex` entries as a `fingerprint`
    and a `package` link.
    """

    class Meta:
        model = ApproximateDirectoryContentIndex
        fields = ('fingerprint', 'package')


class ApproximateDirectoryStructureIndexSerializer(BaseDirectoryIndexSerializer):
    """
    Serialize `ApproximateDirectoryStructureIndex` entries as a
    `fingerprint` and a `package` link.
    """

    class Meta:
        model = ApproximateDirectoryStructureIndex
        fields = ('fingerprint', 'package')


class BaseDirectoryIndexMatchSerializer(Serializer):
    """
    Serialize one directory match result.

    Each result carries the queried `fingerprint`, the `matched_fingerprint`
    found in the index, a read-only hyperlink to the matched `package` and a
    floating point `similarity_score`.
    """
    fingerprint = CharField()
    matched_fingerprint = CharField()
    package = HyperlinkedRelatedField(
        lookup_field='uuid',
        read_only=True,
        view_name='api:package-detail',
    )
    similarity_score = FloatField()


class CharMultipleWidget(widgets.TextInput):
    """
    Enables the support for `MultiValueDict` `?field=a&field=b`
    reusing the `SelectMultiple.value_from_datadict()` but render as a `TextInput`.
    """
    def value_from_datadict(self, data, files, name):
        """
        Return the list of values submitted for `name`, or an empty string
        when nothing (or only a single empty value) was submitted.
        """
        values = widgets.SelectMultiple().value_from_datadict(data, files, name)
        if not values or values == ['']:
            return ''

        return values

    def format_value(self, value):
        """
        Return a value as it should appear when rendered in a template.

        A list of values is rendered comma-separated. `None` and other empty
        values render as an empty string instead of raising, and a plain
        string is returned unchanged rather than being joined character by
        character.
        """
        if not value:
            return ''

        if isinstance(value, str):
            return value

        return ', '.join(value)


class MultipleCharField(MultipleChoiceField):
    """
    `MultipleChoiceField` flavour used by `MultipleCharFilter`: rendered with
    `CharMultipleWidget` and accepting any submitted value.
    """
    widget = CharMultipleWidget

    def valid_value(self, value):
        # There is no fixed set of choices to check against: every value is
        # reported as valid.
        return True


class MultipleCharFilter(MultipleChoiceFilter):
    """
    Filter a CharField-like field on several values passed with the
    `?field=a&field=b` URL syntax.
    """
    field_class = MultipleCharField


# TODO: Think of a better name for this filter
class MultipleCharInFilter(MultipleCharFilter):
    """
    `MultipleCharFilter` variant that applies all the submitted values at
    once through an `__in` lookup.
    """
    def filter(self, qs, value):
        # Nothing submitted, or the filter reports itself as a no-op for
        # these values: hand the queryset back untouched.
        if not value or self.is_noop(qs, value):
            return qs

        lookups = self.get_filter_predicate(value)
        # Re-key the first predicate entry from `<name>` to `<name>__in`.
        field_name = next(iter(lookups))
        lookups[f'{field_name}__in'] = lookups.pop(field_name)

        filtered = self.get_method(qs)(Q(**lookups))
        if self.distinct:
            return filtered.distinct()
        return filtered


class MultipleSHA1Filter(MultipleCharFilter):
    """
    `MultipleCharFilter` whose `filter()` converts each submitted SHA1 with
    `hexstring_to_binarray()` before querying the `sha1` field.
    """
    def filter(self, qs, value):
        if not value:
            return qs

        # OR together one `sha1` lookup per submitted checksum.
        any_sha1 = Q()
        for hex_digest in value:
            any_sha1.add(Q(sha1=hexstring_to_binarray(hex_digest)), Q.OR)

        return qs.filter(any_sha1)


class MultipleFingerprintFilter(MultipleCharFilter):
    """
    `MultipleCharFilter` whose `filter()` breaks each fingerprint string into
    the separate values used for querying.

    In the BaseDirectoryIndex model, the fingerprint is not stored as a
    single field: it is kept as four chunks of equal size. Each submitted
    fingerprint is therefore split into its indexed elements count and its
    four chunks, which are matched against their respective fields.
    """
    def filter(self, qs, value):
        if not value:
            return qs

        # OR together one exact lookup per submitted fingerprint.
        any_fingerprint = Q()
        for fingerprint in value:
            element_count, bah128 = split_fingerprint(fingerprint)
            first, second, third, fourth = create_halohash_chunks(bah128)
            exact_lookup = Q(
                indexed_elements_count=element_count,
                chunk1=first,
                chunk2=second,
                chunk3=third,
                chunk4=fourth,
            )
            any_fingerprint.add(exact_lookup, Q.OR)

        return qs.filter(any_fingerprint)


class BaseFileIndexFilterSet(FilterSet):
    """Declare the multi-valued `sha1` filter shared by its subclasses."""
    sha1 = MultipleSHA1Filter()


class ExactFileIndexFilterSet(BaseFileIndexFilterSet):
    """Filter `ExactFileIndex` entries by `sha1`."""

    class Meta:
        model = ExactFileIndex
        fields = ('sha1',)


class ExactPackageArchiveFilterSet(BaseFileIndexFilterSet):
    """Filter `ExactPackageArchiveIndex` entries by `sha1`."""

    class Meta:
        model = ExactPackageArchiveIndex
        fields = ('sha1',)


class BaseDirectoryIndexFilterSet(FilterSet):
    """Declare the multi-valued `fingerprint` filter shared by its subclasses."""
    fingerprint = MultipleFingerprintFilter()


class ApproximateDirectoryContentFilterSet(BaseDirectoryIndexFilterSet):
    """Filter `ApproximateDirectoryContentIndex` entries by `fingerprint`."""

    class Meta:
        model = ApproximateDirectoryContentIndex
        fields = ('fingerprint',)


class ApproximateDirectoryStructureFilterSet(BaseDirectoryIndexFilterSet):
    """Filter `ApproximateDirectoryStructureIndex` entries by `fingerprint`."""

    class Meta:
        model = ApproximateDirectoryStructureIndex
        fields = ('fingerprint',)


class BaseFileIndexViewSet(ReadOnlyModelViewSet):
    # Read-only base viewset for the file indexes: individual entries are
    # looked up by their `sha1`.
    lookup_field = 'sha1'


class ExactFileIndexViewSet(BaseFileIndexViewSet):
    # Read-only endpoints over all `ExactFileIndex` entries.
    filterset_class = ExactFileIndexFilterSet
    serializer_class = ExactFileIndexSerializer
    queryset = ExactFileIndex.objects.all()


class ExactPackageArchiveIndexViewSet(BaseFileIndexViewSet):
    # Read-only endpoints over all `ExactPackageArchiveIndex` entries.
    filterset_class = ExactPackageArchiveFilterSet
    serializer_class = ExactPackageArchiveIndexSerializer
    queryset = ExactPackageArchiveIndex.objects.all()


class BaseDirectoryIndexViewSet(ReadOnlyModelViewSet):
    # Read-only base viewset for the directory indexes: individual entries
    # are looked up by `fingerprint`. The index model is taken from the
    # `Meta.model` of the serializer class configured on the viewset.
    lookup_field = 'fingerprint'

    @action(detail=False)
    def match(self, request):
        """
        Return the index entries matching the directory fingerprints given
        in the `fingerprint` query parameter, which can be repeated.

        Each result holds the queried `fingerprint`, the
        `matched_fingerprint`, the matched `package` and a
        `similarity_score` computed as `(128 - hamming distance) / 128`.
        Duplicated fingerprints are only matched once and results are
        returned in the order the fingerprints were given. An empty response
        is returned when no fingerprint is provided.
        """
        fingerprints = request.query_params.getlist('fingerprint')
        if not fingerprints:
            return Response()

        # The model is a class-level attribute of the serializer: there is no
        # need to build a serializer instance to reach it.
        model_class = self.get_serializer_class().Meta.model
        results = []
        # `dict.fromkeys` removes duplicates like a set would, but keeps the
        # order of the request so that the response ordering is stable.
        for fingerprint in dict.fromkeys(fingerprints):
            matches = model_class.match(fingerprint)
            # The queried fingerprint is the same for all of its matches:
            # parse it once, and only when there is at least one match so a
            # fingerprint without matches is never parsed here.
            bah128 = None
            for matched in matches:
                if bah128 is None:
                    _, bah128 = split_fingerprint(fingerprint)
                # Get fingerprint from the match
                matched_fingerprint = matched.fingerprint()
                _, matched_bah128 = split_fingerprint(matched_fingerprint)
                hd = byte_hamming_distance(bah128, matched_bah128)
                similarity_score = (128 - hd) / 128
                results.append(
                    {
                        'fingerprint': fingerprint,
                        'matched_fingerprint': matched_fingerprint,
                        'package': matched.package,
                        'similarity_score': similarity_score,
                    }
                )

        serialized_match_results = BaseDirectoryIndexMatchSerializer(
            results,
            context={'request': request},
            many=True
        )
        return Response(serialized_match_results.data)


class ApproximateDirectoryContentIndexViewSet(BaseDirectoryIndexViewSet):
    # Read-only endpoints over all `ApproximateDirectoryContentIndex` entries.
    filterset_class = ApproximateDirectoryContentFilterSet
    serializer_class = ApproximateDirectoryContentIndexSerializer
    queryset = ApproximateDirectoryContentIndex.objects.all()


class ApproximateDirectoryStructureIndexViewSet(BaseDirectoryIndexViewSet):
    # Read-only endpoints over all `ApproximateDirectoryStructureIndex` entries.
    filterset_class = ApproximateDirectoryStructureFilterSet
    serializer_class = ApproximateDirectoryStructureIndexSerializer
    queryset = ApproximateDirectoryStructureIndex.objects.all()
9 changes: 9 additions & 0 deletions minecode/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,12 @@ def update_status(self, request, *args, **kwargs):
}

return Response(msg)

@action(detail=False, methods=['get'])
def statistics(self, request, *args, **kwargs):
    """
    Return the scan queue statistics.
    """
    # The statistics are computed by the ScannableURI manager and returned
    # as-is in the response body.
    queue_statistics = ScannableURI.objects.statistics()
    return Response(queue_statistics)

5 changes: 4 additions & 1 deletion minecode/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,13 +114,16 @@ def index_package(scannable_uri, package, scan_data, summary_data, project_extra
'copyright': copyright,
**checksums_and_size_by_field
}
# do not override fields with empty values
values_by_updateable_fields = {k: v for k, v in values_by_updateable_fields.items() if v}

_, updated_fields = package.update_fields(save=True, **values_by_updateable_fields)
updated_fields = ', '.join(updated_fields)
message = f'Updated fields for Package {package.purl}: {updated_fields}'
logger.info(message)
scannable_uri.scan_status = ScannableURI.SCAN_INDEXED
scannable_uri.save()
except Exception as e:
except Exception:
traceback_message = traceback.format_exc()
error_message = traceback_message + '\n'
# TODO: We should rerun the specific indexers that have failed
Expand Down
Loading

0 comments on commit 275d6da

Please sign in to comment.