From 4f0ff5042215d7afe870318d7577490dc5607cd4 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Sat, 26 Oct 2024 22:44:32 +0200 Subject: [PATCH] working on the testcases and some bugfixes on the proxy --- ontologytimemachine/custom_proxy.py | 29 ++++++++++--- ontologytimemachine/utils/config.py | 4 +- ontologytimemachine/utils/proxy_logic.py | 53 +++++++++++++----------- tests/archivo_test_IRIs.tsv | 30 +++++++------- tests/non_archivo_test_IRIs.tsv | 24 +++++------ tests/test_proxy_auth_header.py | 45 +++++++++++--------- 6 files changed, 107 insertions(+), 78 deletions(-) diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index 4a12048..a685474 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -42,11 +42,11 @@ def __init__(self, *args, **kwargs): def before_upstream_connection(self, request: HttpParser) -> HttpParser | None: # self.client.config = QUOTE_NONE - logger.info("Before upstream connection hook") + logger.info("Before upstream connection hook") logger.info(f"Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}") wrapped_request = HttpRequestWrapper(request) - if (self.config.clientConfigViaProxyAuth == ClientConfigViaProxyAuth.REQUIRED or self.config.clientConfigViaProxyAuth == ClientConfigViaProxyAuth.OPTIONAL) and not wrapped_request.is_connect_request(): + if (self.config.clientConfigViaProxyAuth == ClientConfigViaProxyAuth.REQUIRED or self.config.clientConfigViaProxyAuth == ClientConfigViaProxyAuth.OPTIONAL): logger.info('Setting up config from auth') config_from_auth = evaluate_configuration(wrapped_request, self.config) if (not config_from_auth and self.config.clientConfigViaProxyAuth == ClientConfigViaProxyAuth.REQUIRED): @@ -68,10 +68,13 @@ def before_upstream_connection(self, request: HttpParser) -> HttpParser | None: config = self.client.config else: logger.info("Using the 
proxy configuration") - config = self.config - + config = self.config + if wrapped_request.is_connect_request(): logger.info(f"Handling CONNECT request: configured HTTPS interception mode: {config.httpsInterception}") + # Mark if there is a connect request + if not hasattr(self.client, "mark_connect"): + self.client.mark_connect = True # Check whether to allow CONNECT requests since they can impose a security risk if not do_block_CONNECT_request(config): @@ -82,7 +85,7 @@ def before_upstream_connection(self, request: HttpParser) -> HttpParser | None: return None response = get_response_from_request(wrapped_request, config) - if response: + if response.status_code: logger.info(response.status_code) self.queue_response(response) return None @@ -98,6 +101,7 @@ def do_intercept(self, _request: HttpParser) -> bool: if hasattr(self.client, "config"): logger.info("Using the configuration from the Auth") config = self.client.config + logger.info(f'Config: {config}') else: logger.info("Using the proxy configuration") config = self.config @@ -129,6 +133,21 @@ def handle_client_request(self, request: HttpParser) -> HttpParser: logger.info("Handle client request hook") logger.info(f"Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}") + wrapped_request = HttpRequestWrapper(request) + if (wrapped_request.is_head_request() or wrapped_request.is_get_request()) and hasattr(self.client, "mark_connect"): + if self.client.mark_connect: + if hasattr(self.client, "config"): + logger.info("Using the configuration from the Auth") + config = self.client.config + else: + logger.info("Using the proxy configuration") + config = self.config + response = get_response_from_request(wrapped_request, config) + if response.status_code: + logger.info(response.status_code) + self.queue_response(response) + return None + return request def handle_upstream_chunk(self, chunk: memoryview): diff --git 
a/ontologytimemachine/utils/config.py b/ontologytimemachine/utils/config.py index 680d4ca..49dc39d 100644 --- a/ontologytimemachine/utils/config.py +++ b/ontologytimemachine/utils/config.py @@ -61,7 +61,7 @@ class OntoVersion(EnumValuePrint): ORIGINAL_FAILOVER_LIVE_LATEST = "originalFailoverLiveLatest" LATEST_ARCHIVED = "latestArchived" TIMESTAMP_ARCHIVED = "timestampArchived" - DEPENDENCY_MANIFEST = "dependencyManifest" + #DEPENDENCY_MANIFEST = "dependencyManifest" class HttpsInterception(EnumValuePrint): @@ -91,7 +91,7 @@ class Config: ontoFormatConf: OntoFormatConfig = field(default_factory=OntoFormatConfig) ontoVersion: OntoVersion = OntoVersion.ORIGINAL_FAILOVER_LIVE_LATEST restrictedAccess: bool = False - clientConfigViaProxyAuth: ClientConfigViaProxyAuth = ClientConfigViaProxyAuth.IGNORE + clientConfigViaProxyAuth: ClientConfigViaProxyAuth = ClientConfigViaProxyAuth.REQUIRED httpsInterception: HttpsInterception = HttpsInterception.ALL disableRemovingRedirects: bool = False timestamp: str = "" diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py index a4b5b0d..af22f4b 100644 --- a/ontologytimemachine/utils/proxy_logic.py +++ b/ontologytimemachine/utils/proxy_logic.py @@ -45,6 +45,7 @@ def do_deny_request_due_non_archivo_ontology_uri(wrapped_request, config): def get_response_from_request(wrapped_request, config): + logger.info('Get response from request') do_deny = do_deny_request_due_non_archivo_ontology_uri(wrapped_request, config) if do_deny: logger.warning( @@ -148,16 +149,14 @@ def request_ontology( ): allow_redirects = not disableRemovingRedirects try: - logger.info(headers) - logger.info(allow_redirects) if wrapped_request.is_head_request(): - response = requests.head( - url=url, headers=headers, allow_redirects=allow_redirects, timeout=3 - ) + response = requests.head(url=url, headers=headers, allow_redirects=allow_redirects, timeout=3) + logger.info(response.content) + logger.info(response.status_code) 
else: - response = requests.get( - url=url, headers=headers, allow_redirects=allow_redirects, timeout=3 - ) + response = requests.get(url=url, headers=headers, allow_redirects=allow_redirects, timeout=3) + logger.info(response.content) + logger.info(response.status_code) logger.info("Successfully fetched ontology") return response except Exception as e: @@ -192,7 +191,7 @@ def proxy_logic(wrapped_request, config): ) elif config.ontoVersion == OntoVersion.LATEST_ARCHIVED: logger.info('OntoVersion LATEST_ARCHIVED') - response = fetch_latest_archived(wrapped_request, ontology, headers) + response = fetch_latest_archived(wrapped_request, headers) elif config.ontoVersion == OntoVersion.TIMESTAMP_ARCHIVED: logger.info('OntoVersion TIMESTAMP_ARCHIVED') response = fetch_timestamp_archived(wrapped_request, headers, config) @@ -218,25 +217,29 @@ def fetch_failover(wrapped_request, headers, disableRemovingRedirects): original_response = request_ontology( wrapped_request, ontology, headers, disableRemovingRedirects ) - if original_response.status_code in passthrough_status_codes: - requested_mimetypes_with_priority = parse_accept_header_with_priority( - headers["Accept"] - ) - requested_mimetypes = [x[0] for x in requested_mimetypes_with_priority] - response_mime_type = original_response.headers.get("Content-Type", ";").split( - ";" - )[0] - logger.info(f"Requested mimetypes: {requested_mimetypes}") - logger.info(f"Response mimetype: {response_mime_type}") - if response_mime_type in requested_mimetypes: - return original_response + logger.info(f'Original response: {original_response}') + if original_response: + logger.info('Got an original response') + if original_response.status_code in passthrough_status_codes: + requested_mimetypes_with_priority = parse_accept_header_with_priority( + headers["Accept"] + ) + requested_mimetypes = [x[0] for x in requested_mimetypes_with_priority] + response_mime_type = original_response.headers.get("Content-Type", ";").split( + ";" + )[0] + 
logger.info(f"Requested mimetypes: {requested_mimetypes}") + logger.info(f"Response mimetype: {response_mime_type}") + if response_mime_type in requested_mimetypes: + return original_response + else: + logger.info(f"The returned type is not the same as the requested one") + return fetch_latest_archived(wrapped_request, headers) else: - logger.info(f"The returned type is not the same as the requested one") + logger.info(f"The returend status code is not accepted: {original_response.status_code}") return fetch_latest_archived(wrapped_request, headers) else: - logger.info( - f"The returend status code is not accepted: {original_response.status_code}" - ) + logger.info("No original response") return fetch_latest_archived(wrapped_request, headers) diff --git a/tests/archivo_test_IRIs.tsv b/tests/archivo_test_IRIs.tsv index c7f8507..17c7908 100644 --- a/tests/archivo_test_IRIs.tsv +++ b/tests/archivo_test_IRIs.tsv @@ -1,22 +1,22 @@ enable_testcase iri error_dimension expected_error iri_type comment -0 http://buzzword.org.uk/rdf/personal-link-types# content text/html hash weird html instead of text/turtle -0 http://data.finlex.fi/schema/sfl/ content 0-bytes slash 0 bytes content-length -0 http://data.bigdatagrapes.eu/resource/ontology/ dns nxdomain slash -0 http://data.bigdatagrapes.eu/resource/ontology/MeasurementContext dns nxdomain term -0 http://data.ontotext.com/resource/leak/ http-code 502 slash -0 http://data.europa.eu/esco/flow http-code 406 slash -0 http://bdi.si.ehu.es/bdi/ontologies/ExtruOnt/ExtruOnt transport connect-timeout slash -0 http://catalogus-professorum.org/cpm/2/ transport connection-refused slash -0 http://www.w3.org/1999/02/22-rdf-syntax-ns# None hash -0 http://xmlns.com/foaf/0.1/ None slash -0 http://xmlns.com/foaf/0.1/Person None term -0 http://dbpedia.org/ontology/ None term -0 http://dbpedia.org/ontology/Person None term -1 https://bag2.basisregistraties.overheid.nl/bag/def/ http-code 404 slash +1 http://buzzword.org.uk/rdf/personal-link-types# 
content text/html hash weird html instead of text/turtle +1 http://data.finlex.fi/schema/sfl/ content 0-bytes slash 0 bytes content-length +1 http://data.bigdatagrapes.eu/resource/ontology/ dns nxdomain slash +1 http://data.bigdatagrapes.eu/resource/ontology/MeasurementContext dns nxdomain term +1 http://data.ontotext.com/resource/leak/ http-code 502 slash +1 http://data.europa.eu/esco/flow http-code 406 slash +1 http://bdi.si.ehu.es/bdi/ontologies/ExtruOnt/ExtruOnt transport connect-timeout slash +1 http://catalogus-professorum.org/cpm/2/ transport connection-refused slash +1 http://www.w3.org/1999/02/22-rdf-syntax-ns# None hash +1 http://xmlns.com/foaf/0.1/ None slash +1 http://xmlns.com/foaf/0.1/Person None term +1 http://dbpedia.org/ontology/ None term +1 http://dbpedia.org/ontology/Person None term +0 https://bag2.basisregistraties.overheid.nl/bag/def/ http-code 404 slash 0 https://bag2.basisregistraties.overheid.nl/bag/def/Gebruiksdoel http-code 404 term 0 https://id.parliament.uk/schema http-code 404 slash slash onto without trailing slash / 0 https://id.parliament.uk/schema/Approval http-code 404 term slash onto without trailing slash / 0 https://bmake.th-brandenburg.de/spv# http-code 403 hash 0 https://bmake.th-brandenburg.de/spv http-code 403 hash just test whether Archivo API is used correctly 0 https://w3id.org/ttla/ transport cert-expired hash -0 http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf transport connection-refused hash +1 http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf transport connection-refused hash diff --git a/tests/non_archivo_test_IRIs.tsv b/tests/non_archivo_test_IRIs.tsv index 86013ac..627c6bf 100644 --- a/tests/non_archivo_test_IRIs.tsv +++ b/tests/non_archivo_test_IRIs.tsv @@ -1,12 +1,12 @@ -iri error_dimension expected_error iri_type comment -https://data.ontotext.com/resource/leak/ http-code 401 https is not ID -https://www.w3.org/1999/02/22-rdf-syntax-ns# None https is not ID -http://example.org None -https://example.org None 
-http://1.1.1.1 None -https://1.1.1.1 None -https://data.globalchange.gov/gcis.owl http-code "403 " https is not ID -https://data.ordnancesurvey.co.uk/ontology/geometry/ http-code 404 https is not ID -https://data.ordnancesurvey.co.uk/ontology/ http-code 301 https is not ID -https://google.com None - +enable_testcase iri error_dimension expected_error iri_type comment +0 https://data.ontotext.com/resource/leak/ http-code 401 https is not ID +0 https://www.w3.org/1999/02/22-rdf-syntax-ns# None https is not ID +0 http://example.org None +0 https://example.org None +0 http://1.1.1.1 None +0 https://1.1.1.1 None +0 https://data.globalchange.gov/gcis.owl http-code 403 https is not ID +0 https://data.ordnancesurvey.co.uk/ontology/geometry/ http-code 404 https is not ID +0 https://data.ordnancesurvey.co.uk/ontology/ http-code 301 https is not ID +0 https://google.com None +0 \ No newline at end of file diff --git a/tests/test_proxy_auth_header.py b/tests/test_proxy_auth_header.py index 57c7ef3..21ee6e6 100644 --- a/tests/test_proxy_auth_header.py +++ b/tests/test_proxy_auth_header.py @@ -14,6 +14,7 @@ HTTP_PROXY = f"http://{PROXY}" HTTPS_PROXY = f"http://{PROXY}" PROXIES = {"http": HTTP_PROXY, "https": HTTPS_PROXY} +CA_CERT_PATH = "ca-cert.pem" logging.basicConfig( level=logging.ERROR, @@ -37,7 +38,9 @@ def create_fake_response(status_code='error'): def make_request_without_proxy(iri: str) -> Tuple[int, str]: """Make a direct request to the IRI without using the proxy.""" - headers = {} + headers = { + "Accept": "text/turtle" + } try: response = requests.get(iri, timeout=10, headers=headers, allow_redirects=True) return response @@ -73,11 +76,13 @@ def make_request_with_proxy(iri: str, mode: str) -> Tuple[int, str]: username = f"--ontoVersion {mode}" password = "my_password" headers = { + "Accept": "text/turtle", "Accept-Encoding": "identity", "Proxy-Authorization": _basic_auth_str(username, password) } try: - response = requests.get(iri, proxies=PROXIES, headers=headers, 
timeout=10) + # There is an issue here for https requests + response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH, headers=headers, timeout=10) return response except SSLError as e: mock_response = Mock() @@ -103,7 +108,7 @@ def make_request_with_proxy(iri: str, mode: str) -> Tuple[int, str]: else: mock_response = Mock() mock_response.content = '' - mock_response.status_code = 406 + mock_response.status_code = 'error' return mock_response except Exception as e: mock_response = Mock() @@ -125,48 +130,50 @@ def test_proxy_responses(test_case): if enabled == '1': # Make direct and proxy requests direct_response = make_request_without_proxy(iri) - logger.info(direct_response) - proxy_response = make_request_with_proxy(iri, 'original') - #proxy_response = make_request_with_proxy(iri, 'original') - #proxy_response = make_request_with_proxy(iri, 'laters') - #proxy_response = make_request_with_proxy(iri, 'original') - + proxy_original_response = make_request_with_proxy(iri, 'original') + proxy_failover_response = make_request_with_proxy(iri, 'originalFailoverLiveLatest') + proxy_archivo_laest_response = make_request_with_proxy(iri, 'latestArchived') + # Evaluation based on error_dimension if error_dimension == 'http-code': assert int(expected_error) == direct_response.status_code - assert int(expected_error) == proxy_response.status_code + assert int(expected_error) == proxy_original_response.status_code + elif error_dimension == 'None': assert direct_response.status_code == 200 - assert proxy_response.status_code == 200 + assert proxy_original_response.status_code == 200 elif error_dimension == 'content': if expected_error == 'text_html': assert direct_response.headers.get('Content-Type') == 'text/html' - assert proxy_response.headers.get('Content-Type') == 'text/html' + assert proxy_original_response.headers.get('Content-Type') == 'text/html' elif expected_error == '0-bytes': assert len(direct_response.content) == 0 - assert len(proxy_response.content) == 0 + 
assert len(proxy_original_response.content) == 0 elif error_dimension == 'dns': if expected_error == 'nxdomain': assert direct_response.status_code == 'nxdomain-error' - assert proxy_response.status_code == 502 + assert proxy_original_response.status_code == 502 elif error_dimension == 'transport': if expected_error == 'cert-expired': assert direct_response.status_code == 'ssl-error' - assert proxy_response.status_code == 'ssl-error' + assert proxy_original_response.status_code == 'ssl-error' elif expected_error == 'connect-timeout': assert direct_response.status_code == 'timeout-error' - assert proxy_response.status_code == 'timeout-error' + assert proxy_original_response.status_code == 'timeout-error' elif expected_error == 'connect-refused': assert direct_response.status_code == 'connection-refused-error' - assert proxy_response.status_code == 'connection-refused-error' + assert proxy_original_response.status_code == 'connection-refused-error' - else: - assert True == True + assert 200 == proxy_failover_response.status_code + assert 200 == proxy_archivo_laest_response.status_code + else: + assert True + if __name__ == "__main__":