Use wildcard for paths in maven CF invalidating

* And ignore the indexing(index.html) CF invalidating per cost consideration * And change the root work dir for npm uploading
Commonjava · Mar 28, 2024 · a52ba7a · a52ba7a
1 parent a92f24e
commit a52ba7a
Show file tree

Hide file tree

Showing 9 changed files with 118 additions and 76 deletions.
diff --git a/charon/cache.py b/charon/cache.py
@@ -59,38 +59,53 @@ def __get_endpoint(self, extra_conf) -> str:
             logger.debug("[CloudFront] No user-specified endpoint url is used.")
         return endpoint_url
 
-    def invalidate_paths(self, distr_id: str, paths: List[str]) -> Dict[str, str]:
+    def invalidate_paths(
+        self, distr_id: str, paths: List[str],
+        batch_size: int = 15
+    ) -> List[Dict[str, str]]:
         """Send a invalidating requests for the paths in distribution to CloudFront.
         This will invalidate the paths in the distribution to enforce the refreshment
         from backend S3 bucket for these paths. For details see:
         https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/Invalidation.html
             * The distr_id is the id for the distribution. This id can be get through
             get_dist_id_by_domain(domain) function
             * Can specify the invalidating paths through paths param.
+            * Batch size is the number of paths to be invalidated in one request.
+            Because paths contains wildcard(*), so the default value is 15 which
+            is the maximum number in official doc:
+            https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/Invalidation.html#InvalidationLimits
         """
-        caller_ref = str(uuid.uuid4())
         logger.debug("[CloudFront] Creating invalidation for paths: %s", paths)
-        try:
-            response = self.__client.create_invalidation(
-                DistributionId=distr_id,
-                InvalidationBatch={
-                    'CallerReference': caller_ref,
-                    'Paths': {
-                        'Quantity': len(paths),
-                        'Items': paths
+        real_paths = [paths]
+        # Split paths into batches by batch_size
+        if batch_size:
+            real_paths = [paths[i:i + batch_size] for i in range(0, len(paths), batch_size)]
+        results = []
+        for batch_paths in real_paths:
+            caller_ref = str(uuid.uuid4())
+            try:
+                response = self.__client.create_invalidation(
+                    DistributionId=distr_id,
+                    InvalidationBatch={
+                        'CallerReference': caller_ref,
+                        'Paths': {
+                            'Quantity': len(batch_paths),
+                            'Items': batch_paths
+                        }
                     }
-                }
-            )
-            if response:
-                invalidation = response.get('Invalidation', {})
-                return {
-                    'Id': invalidation.get('Id', None),
-                    'Status': invalidation.get('Status', None)
-                }
-        except Exception as err:
-            logger.error(
-                "[CloudFront] Error occurred while creating invalidation, error: %s", err
-            )
+                )
+                if response:
+                    invalidation = response.get('Invalidation', {})
+                    results.append({
+                        'Id': invalidation.get('Id', None),
+                        'Status': invalidation.get('Status', None)
+                    })
+            except Exception as err:
+                logger.error(
+                    "[CloudFront] Error occurred while creating invalidation"
+                    " for paths %s, error: %s", batch_paths, err
+                )
+        return results
 
     def check_invalidation(self, distr_id: str, invalidation_id: str) -> dict:
         try:
@@ -115,7 +130,7 @@ def get_dist_id_by_domain(self, domain: str) -> str:
         """Get distribution id by a domain name. The id can be used to send invalidating
         request through #invalidate_paths function
            * Domain are Ronda domains, like "maven.repository.redhat.com"
-           or "npm.repository.redhat.com"
+           or "npm.registry.redhat.com"
         """
         try:
             response = self.__client.list_distributions()

diff --git a/charon/pkgs/indexing.py b/charon/pkgs/indexing.py
@@ -15,8 +15,8 @@
 """
 from charon.config import get_template
 from charon.storage import S3Client
-from charon.cache import CFClient
-from charon.pkgs.pkg_utils import invalidate_cf_paths
+# from charon.cache import CFClient
+# from charon.pkgs.pkg_utils import invalidate_cf_paths
 from charon.constants import (INDEX_HTML_TEMPLATE, NPM_INDEX_HTML_TEMPLATE,
                               PACKAGE_TYPE_MAVEN, PACKAGE_TYPE_NPM, PROD_INFO_SUFFIX)
 from charon.utils.files import digest_content
@@ -265,7 +265,7 @@ def re_index(
     path: str,
     package_type: str,
     aws_profile: str = None,
-    cf_enable: bool = False,
+    # cf_enable: bool = False,
     dry_run: bool = False
 ):
     """Refresh the index.html for the specified folder in the bucket.
@@ -312,9 +312,10 @@ def re_index(
                 index_path, index_content, (bucket_name, real_prefix),
                 "text/html", digest_content(index_content)
             )
-            if cf_enable:
-                cf_client = CFClient(aws_profile=aws_profile)
-                invalidate_cf_paths(cf_client, bucket, [index_path])
+            # We will not invalidate index.html per cost consideration
+            # if cf_enable:
+            #     cf_client = CFClient(aws_profile=aws_profile)
+            #     invalidate_cf_paths(cf_client, bucket, [index_path])
     else:
         logger.warning(
             "The path %s does not contain any contents in bucket %s. "

diff --git a/charon/pkgs/maven.py b/charon/pkgs/maven.py
@@ -58,7 +58,9 @@ def __get_mvn_template(kind: str, default: str) -> str:
 
 META_TEMPLATE = __get_mvn_template("maven-metadata.xml.j2", MAVEN_METADATA_TEMPLATE)
 ARCH_TEMPLATE = __get_mvn_template("archetype-catalog.xml.j2", ARCHETYPE_CATALOG_TEMPLATE)
-STANDARD_GENERATED_IGNORES = ["maven-metadata.xml", "archetype-catalog.xml"]
+MAVEN_METADATA_FILE = "maven-metadata.xml"
+MAVEN_ARCH_FILE = "archetype-catalog.xml"
+STANDARD_GENERATED_IGNORES = [MAVEN_METADATA_FILE, MAVEN_ARCH_FILE]
 
 
 class MavenMetadata(object):
@@ -219,7 +221,7 @@ def gen_meta_file(group_id, artifact_id: str, versions: list, root="/", digest=T
     ).generate_meta_file_content()
     g_path = "/".join(group_id.split("."))
     meta_files = []
-    final_meta_path = os.path.join(root, g_path, artifact_id, "maven-metadata.xml")
+    final_meta_path = os.path.join(root, g_path, artifact_id, MAVEN_METADATA_FILE)
     try:
         overwrite_file(final_meta_path, content)
         meta_files.append(final_meta_path)
@@ -374,7 +376,7 @@ def handle_maven_uploading(
                 cf_invalidate_paths.extend(meta_files.get(META_FILE_GEN_KEY, []))
 
         # 8. Determine refreshment of archetype-catalog.xml
-        if os.path.exists(os.path.join(top_level, "archetype-catalog.xml")):
+        if os.path.exists(os.path.join(top_level, MAVEN_ARCH_FILE)):
             logger.info("Start generating archetype-catalog.xml for bucket %s", bucket_name)
             upload_archetype_file = _generate_upload_archetype_catalog(
                 s3=s3_client, bucket=bucket_name,
@@ -451,15 +453,16 @@ def handle_maven_uploading(
             )
             failed_metas.extend(_failed_metas)
             logger.info("Index files updating done\n")
-            # Add index files to Cf invalidate paths
-            if cf_enable:
-                cf_invalidate_paths.extend(created_indexes)
+            # We will not invalidate the index files per cost consideration
+            # if cf_enable:
+            #     cf_invalidate_paths.extend(created_indexes)
         else:
             logger.info("Bypass indexing")
 
         # Finally do the CF invalidating for metadata files
         if cf_enable and len(cf_invalidate_paths) > 0:
             cf_client = CFClient(aws_profile=aws_profile)
+            cf_invalidate_paths = __wildcard_metadata_paths(cf_invalidate_paths)
             invalidate_cf_paths(cf_client, bucket, cf_invalidate_paths, top_level)
 
         upload_post_process(failed_files, failed_metas, prod_key, bucket_name)
@@ -578,7 +581,7 @@ def handle_maven_del(
             cf_invalidate_paths.extend(all_meta_files)
 
         # 7. Determine refreshment of archetype-catalog.xml
-        if os.path.exists(os.path.join(top_level, "archetype-catalog.xml")):
+        if os.path.exists(os.path.join(top_level, MAVEN_ARCH_FILE)):
             logger.info("Start generating archetype-catalog.xml")
             archetype_action = _generate_rollback_archetype_catalog(
                 s3=s3_client, bucket=bucket_name,
@@ -630,13 +633,15 @@ def handle_maven_del(
             if len(_failed_index_files) > 0:
                 failed_metas.extend(_failed_index_files)
             logger.info("Index files updating done.\n")
-            if cf_enable:
-                cf_invalidate_paths.extend(created_indexes)
+            # We will not invalidate the index files per cost consideration
+            # if cf_enable:
+            #     cf_invalidate_paths.extend(created_indexes)
         else:
             logger.info("Bypassing indexing")
 
         if cf_enable and len(cf_invalidate_paths):
             cf_client = CFClient(aws_profile=aws_profile)
+            cf_invalidate_paths = __wildcard_metadata_paths(cf_invalidate_paths)
             invalidate_cf_paths(cf_client, bucket, cf_invalidate_paths, top_level)
 
         rollback_post_process(failed_files, failed_metas, prod_key, bucket_name)
@@ -1017,15 +1022,15 @@ def _generate_metadatas(
                     "No poms found in s3 bucket %s for GA path %s", bucket, path
                 )
                 meta_files_deletion = meta_files.get(META_FILE_DEL_KEY, [])
-                meta_files_deletion.append(os.path.join(path, "maven-metadata.xml"))
-                meta_files_deletion.extend(__hash_decorate_metadata(path, "maven-metadata.xml"))
+                meta_files_deletion.append(os.path.join(path, MAVEN_METADATA_FILE))
+                meta_files_deletion.extend(__hash_decorate_metadata(path, MAVEN_METADATA_FILE))
                 meta_files[META_FILE_DEL_KEY] = meta_files_deletion
             else:
                 logger.warning("An error happened when scanning remote "
                                "artifacts under GA path %s", path)
                 meta_failed_path = meta_files.get(META_FILE_FAILED, [])
-                meta_failed_path.append(os.path.join(path, "maven-metadata.xml"))
-                meta_failed_path.extend(__hash_decorate_metadata(path, "maven-metadata.xml"))
+                meta_failed_path.append(os.path.join(path, MAVEN_METADATA_FILE))
+                meta_failed_path.extend(__hash_decorate_metadata(path, MAVEN_METADATA_FILE))
                 meta_files[META_FILE_FAILED] = meta_failed_path
         else:
             logger.debug(
@@ -1090,6 +1095,22 @@ def __get_suffix(package_type: str, conf: CharonConfig) -> List[str]:
     return []
 
 
+def __wildcard_metadata_paths(paths: List[str]) -> List[str]:
+    new_paths = []
+    for path in paths:
+        if path.endswith(MAVEN_METADATA_FILE)\
+           or path.endswith(MAVEN_ARCH_FILE):
+            new_paths.append(path[:-len(".xml")] + ".*")
+        elif path.endswith(".md5")\
+            or path.endswith(".sha1")\
+            or path.endswith(".sha128")\
+                or path.endswith(".sha256"):
+            continue
+        else:
+            new_paths.append(path)
+    return new_paths
+
+
 class VersionCompareKey:
     'Used as key function for version sorting'
     def __init__(self, obj):

diff --git a/charon/pkgs/npm.py b/charon/pkgs/npm.py
@@ -105,6 +105,8 @@ def handle_npm_uploading(
 
     client = S3Client(aws_profile=aws_profile, dry_run=dry_run)
     generated_signs = []
+    succeeded = True
+    root_dir = mkdtemp(prefix=f"npm-charon-{product}-", dir=dir_)
     for bucket in buckets:
         # prepare cf invalidate files
         cf_invalidate_paths = []
@@ -113,7 +115,7 @@ def handle_npm_uploading(
         prefix = remove_prefix(bucket[2], "/")
         registry = bucket[3]
         target_dir, valid_paths, package_metadata = _scan_metadata_paths_from_archive(
-            tarball_path, registry, prod=product, dir__=dir_
+            tarball_path, registry, prod=product, dir__=root_dir
         )
         if not os.path.isdir(target_dir):
             logger.error("Error: the extracted target_dir path %s does not exist.", target_dir)
@@ -129,8 +131,6 @@ def handle_npm_uploading(
         )
         logger.info("Files uploading done\n")
 
-        succeeded = True
-
         if not manifest_bucket_name:
             logger.warning(
                 'Warning: No manifest bucket is provided, will ignore the process of manifest '
@@ -235,8 +235,9 @@ def handle_npm_uploading(
             )
             failed_metas.extend(_failed_metas)
             logger.info("Index files updating done\n")
-            if cf_enable:
-                cf_invalidate_paths.extend(created_indexes)
+            # We will not invalidate the index files per cost consideration
+            # if cf_enable:
+            #     cf_invalidate_paths.extend(created_indexes)
         else:
             logger.info("Bypass indexing\n")
 
@@ -248,7 +249,7 @@ def handle_npm_uploading(
         upload_post_process(failed_files, failed_metas, product, bucket_name)
         succeeded = succeeded and len(failed_files) == 0 and len(failed_metas) == 0
 
-    return (target_dir, succeeded)
+    return (root_dir, succeeded)
 
 
 def handle_npm_del(
@@ -360,9 +361,10 @@ def handle_npm_del(
             )
             failed_metas.extend(_failed_index_files)
             logger.info("Index files updating done.\n")
-            if cf_enable and len(created_indexes):
-                logger.debug("Add index files to cf invalidate list: %s", created_indexes)
-                cf_invalidate_paths.extend(created_indexes)
+            # We will not invalidate the index files per cost consideration
+            # if cf_enable and len(created_indexes):
+            #     logger.debug("Add index files to cf invalidate list: %s", created_indexes)
+            #     cf_invalidate_paths.extend(created_indexes)
         else:
             logger.info("Bypassing indexing\n")
 

diff --git a/charon/pkgs/pkg_utils.py b/charon/pkgs/pkg_utils.py
@@ -87,11 +87,17 @@ def invalidate_cf_paths(
     logger.debug("Invalidating paths: %s", final_paths)
     if not domain:
         domain = cf_client.get_domain_by_bucket(bucket_name)
-    distr_id = cf_client.get_dist_id_by_domain(domain)
-    if distr_id:
-        result = cf_client.invalidate_paths(distr_id, final_paths)
-        if result:
-            logger.info(
-                "The CF invalidating request for metadata/indexing is sent, "
-                "request id %s, status is %s", result['Id'], result['Status']
-            )
+    if domain:
+        distr_id = cf_client.get_dist_id_by_domain(domain)
+        if distr_id:
+            result = cf_client.invalidate_paths(distr_id, final_paths)
+            if result:
+                logger.info(
+                    "The CF invalidating request for metadata/indexing is sent, "
+                    "request status as below:\n %s", result
+                )
+    else:
+        logger.error(
+            "CF invalidating will not be performed because domain not found for"
+            " bucket %s. ", bucket_name
+        )
diff --git a/tests/test_cf_reindex.py b/tests/test_cf_reindex.py
@@ -22,10 +22,12 @@
 from tests.constants import INPUTS
 from moto import mock_aws
 import os
+import pytest
 
 
 @mock_aws
 class CFReIndexTest(CFBasedTest):
+    @pytest.mark.skip(reason="Indexing CF invalidation is abandoned")
     def test_cf_maven_after_reindex(self):
         response = self.mock_cf.list_invalidations(DistributionId=self.test_dist_id)
         self.assertIsNotNone(response)
@@ -41,8 +43,7 @@ def test_cf_maven_after_reindex(self):
 
         re_index(
             (TEST_BUCKET, TEST_BUCKET, "ga", "", "maven.repository.redhat.com"),
-            "org/apache/httpcomponents/httpclient/", "maven",
-            cf_enable=True
+            "org/apache/httpcomponents/httpclient/", "maven"
         )
 
         response = self.mock_cf.list_invalidations(DistributionId=self.test_dist_id)
@@ -51,6 +52,7 @@ def test_cf_maven_after_reindex(self):
         self.assertEqual(1, len(items))
         self.assertEqual('completed', str.lower(items[0].get('Status')))
 
+    @pytest.mark.skip(reason="Indexing CF invalidation is abandoned")
     def test_cf_npm_after_reindex(self):
         response = self.mock_cf.list_invalidations(DistributionId=self.test_dist_id)
         self.assertIsNotNone(response)
@@ -66,8 +68,7 @@ def test_cf_npm_after_reindex(self):
 
         re_index(
             (TEST_BUCKET, TEST_BUCKET, "", "", "npm.registry.redhat.com"),
-            "@babel/", "npm",
-            cf_enable=True
+            "@babel/", "npm"
         )
 
         response = self.mock_cf.list_invalidations(DistributionId=self.test_dist_id)

diff --git a/tests/test_cfclient.py b/tests/test_cfclient.py
@@ -48,10 +48,10 @@ def test_get_distribution_id(self):
     def test_invalidate_paths(self):
         dist_id = self.cf_client.get_dist_id_by_domain("maven.repository.redhat.com")
         result = self.cf_client.invalidate_paths(dist_id, ["/*"])
-        self.assertIsNotNone(result['Id'])
-        self.assertEqual('completed', str.lower(result['Status']))
+        self.assertTrue(result[0]['Id'])
+        self.assertEqual('completed', str.lower(result[0]['Status']))
         status = self.cf_client.invalidate_paths("noexists_id", ["/*"])
-        self.assertIsNone(status)
+        self.assertFalse(status)
 
     @pytest.mark.skip(reason="""
         Because current moto 5.0.3 has not implemented the get_invalidation(),
@@ -60,6 +60,6 @@ def test_invalidate_paths(self):
     def test_check_invalidation(self):
         dist_id = self.cf_client.get_dist_id_by_domain("maven.repository.redhat.com")
         result = self.cf_client.invalidate_paths(dist_id, ["/*"])
-        invalidation = self.cf_client.check_invalidation(dist_id, result['Id'])
+        invalidation = self.cf_client.check_invalidation(dist_id, result[0]['Id'])
         self.assertIsNotNone(invalidation['Id'])
-        self.assertEqual('completed', str.lower(result['Status']))
+        self.assertEqual('completed', str.lower(result[0]['Status']))
diff --git a/tests/test_maven_index.py b/tests/test_maven_index.py
@@ -179,8 +179,7 @@ def test_re_index(self):
         )
         re_index(
             (TEST_BUCKET, TEST_BUCKET, "", "", None),
-            commons_client_root, "maven",
-            cf_enable=True
+            commons_client_root, "maven"
         )
         indedx_obj = test_bucket.Object(COMMONS_CLIENT_INDEX)
         index_content = str(indedx_obj.get()["Body"].read(), "utf-8")