Skip to content

Commit

Permalink
feat(minipipeline): introduce "classic" observations filtering (ooni#…
Browse files Browse the repository at this point in the history
…1402)

Classic filtering is an `WebObservationsContainer` filtering technique
that takes in input a `WebObservationsContainer` and only keeps DNS
lookups using `getaddrinfo` and endpoints whose IP address has been
discovered using `getaddrinfo`. By applying this technique, we reduce
the richer dataset produced by Web Connectivity LTE to a smaller dataset
comparable to what Web Connectivity v0.4 would return. In turn, by
focusing the analysis on the reduced dataset, we hope to emulate the
results produced by v0.4 for backward compatible test keys.

I named this feature "classic" because it's what we used to do and I
don't want to call it legacy.

Part of ooni/probe#2634.
  • Loading branch information
bassosimone authored and Murphy-OrangeMud committed Feb 13, 2024
1 parent 2fc7125 commit 23a6844
Show file tree
Hide file tree
Showing 74 changed files with 7,023 additions and 12 deletions.
14 changes: 13 additions & 1 deletion internal/cmd/minipipeline/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ func main() {
fmt.Fprintf(os.Stderr, "\n")
fmt.Fprintf(os.Stderr, "Analyzes the <file> provided using -measurement <file> and writes the\n")
fmt.Fprintf(os.Stderr, "observations.json and analysis.json files in the -destdir <dir> directory,\n")
fmt.Fprintf(os.Stderr, "which must already exist.\n")
fmt.Fprintf(os.Stderr, "which must already exist. Additionally, we also perform a \"classic\"\n")
fmt.Fprintf(os.Stderr, "analysis like the one in Web Connectivity v0.4 and generate accordingly the\n")
fmt.Fprintf(os.Stderr, "observations_classic.json and analysis_classic.json files.\n")
fmt.Fprintf(os.Stderr, "\n")
fmt.Fprintf(os.Stderr, "Use -prefix <prefix> to add <prefix> in front of the generated files names.\n")
fmt.Fprintf(os.Stderr, "\n")
Expand All @@ -58,8 +60,18 @@ func main() {
container := runtimex.Try1(minipipeline.IngestWebMeasurement(&parsed))
mustWriteFileFn(observationsPath, must.MarshalAndIndentJSON(container, "", " "), 0600)

// generate and write classic observations
classicObservationsPath := filepath.Join(*destdirFlag, *prefixFlag+"observations_classic.json")
containerClassic := minipipeline.ClassicFilter(container)
mustWriteFileFn(classicObservationsPath, must.MarshalAndIndentJSON(containerClassic, "", " "), 0600)

// generate and write observations analysis
analysisPath := filepath.Join(*destdirFlag, *prefixFlag+"analysis.json")
analysis := minipipeline.AnalyzeWebObservations(container)
mustWriteFileFn(analysisPath, must.MarshalAndIndentJSON(analysis, "", " "), 0600)

// generate and write the classic analysis
classicAnalysisPath := filepath.Join(*destdirFlag, *prefixFlag+"analysis_classic.json")
analysisClassic := minipipeline.AnalyzeWebObservations(containerClassic)
mustWriteFileFn(classicAnalysisPath, must.MarshalAndIndentJSON(analysisClassic, "", " "), 0600)
}
14 changes: 14 additions & 0 deletions internal/cmd/minipipeline/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,26 @@ func TestMainSuccess(t *testing.T) {
t.Fatal(diff)
}

// make sure the generated classic observations are good
expectedObservationsClassic := mustloadfile(filepath.Join("testdata", "observations_classic.json"))
gotObservationsClassic := mustloaddata(contentmap, filepath.Join("xo", "y-observations_classic.json"))
if diff := cmp.Diff(expectedObservationsClassic, gotObservationsClassic); diff != "" {
t.Fatal(diff)
}

// make sure the generated analysis is good
expectedAnalysis := mustloadfile(filepath.Join("testdata", "analysis.json"))
gotAnalysis := mustloaddata(contentmap, filepath.Join("xo", "y-analysis.json"))
if diff := cmp.Diff(expectedAnalysis, gotAnalysis); diff != "" {
t.Fatal(diff)
}

// make sure the generated classic analysis is good
expectedAnalysisClassic := mustloadfile(filepath.Join("testdata", "analysis_classic.json"))
gotAnalysisClassic := mustloaddata(contentmap, filepath.Join("xo", "y-analysis_classic.json"))
if diff := cmp.Diff(expectedAnalysisClassic, gotAnalysisClassic); diff != "" {
t.Fatal(diff)
}
}

func TestMainUsage(t *testing.T) {
Expand Down
25 changes: 25 additions & 0 deletions internal/cmd/minipipeline/testdata/analysis_classic.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"DNSExperimentFailure": null,
"DNSTransactionsWithBogons": {},
"DNSTransactionsWithUnexpectedFailures": null,
"DNSPossiblyInvalidAddrs": {},
"DNSPossiblyInvalidAddrsClassic": {},
"DNSPossiblyNonexistingDomains": null,
"HTTPDiffBodyProportionFactor": 1,
"HTTPDiffStatusCodeMatch": true,
"HTTPDiffTitleDifferentLongWords": {},
"HTTPDiffUncommonHeadersIntersection": {
"x-drupal-cache": true,
"x-generator": true
},
"HTTPFinalResponsesWithControl": {
"4": true
},
"HTTPFinalResponsesWithTLS": {
"4": true
},
"TCPTransactionsWithUnexpectedTCPConnectFailures": {},
"TCPTransactionsWithUnexpectedTLSHandshakeFailures": {},
"TCPTransactionsWithUnexpectedHTTPFailures": {},
"TCPTransactionsWithUnexplainedUnexpectedFailures": {}
}
111 changes: 111 additions & 0 deletions internal/cmd/minipipeline/testdata/observations_classic.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
{
"DNSLookupFailures": [],
"DNSLookupSuccesses": [
{
"DNSTransactionID": 1,
"DNSDomain": "nexa.polito.it",
"DNSLookupFailure": "",
"DNSQueryType": "ANY",
"DNSEngine": "getaddrinfo",
"IPAddress": "130.192.16.171",
"IPAddressASN": 137,
"IPAddressBogon": false,
"EndpointTransactionID": null,
"EndpointProto": null,
"EndpointPort": null,
"EndpointAddress": null,
"TCPConnectFailure": null,
"TLSHandshakeFailure": null,
"TLSServerName": null,
"HTTPRequestURL": null,
"HTTPFailure": null,
"HTTPResponseStatusCode": null,
"HTTPResponseBodyLength": null,
"HTTPResponseBodyIsTruncated": null,
"HTTPResponseHeadersKeys": null,
"HTTPResponseLocation": null,
"HTTPResponseTitle": null,
"HTTPResponseIsFinal": null,
"ControlDNSDomain": null,
"ControlDNSLookupFailure": null,
"ControlTCPConnectFailure": null,
"MatchWithControlIPAddress": null,
"MatchWithControlIPAddressASN": null,
"ControlTLSHandshakeFailure": null,
"ControlHTTPFailure": null,
"ControlHTTPResponseStatusCode": null,
"ControlHTTPResponseBodyLength": null,
"ControlHTTPResponseHeadersKeys": null,
"ControlHTTPResponseTitle": null
}
],
"KnownTCPEndpoints": {
"4": {
"DNSTransactionID": 3,
"DNSDomain": "nexa.polito.it",
"DNSLookupFailure": "",
"DNSQueryType": null,
"DNSEngine": null,
"IPAddress": "130.192.16.171",
"IPAddressASN": 137,
"IPAddressBogon": false,
"EndpointTransactionID": 4,
"EndpointProto": "tcp",
"EndpointPort": "443",
"EndpointAddress": "130.192.16.171:443",
"TCPConnectFailure": "",
"TLSHandshakeFailure": "",
"TLSServerName": "nexa.polito.it",
"HTTPRequestURL": "https://nexa.polito.it/",
"HTTPFailure": "",
"HTTPResponseStatusCode": 200,
"HTTPResponseBodyLength": 36564,
"HTTPResponseBodyIsTruncated": false,
"HTTPResponseHeadersKeys": {
"Cache-Control": true,
"Content-Language": true,
"Content-Type": true,
"Date": true,
"Etag": true,
"Expires": true,
"Last-Modified": true,
"Link": true,
"Server": true,
"Vary": true,
"X-Content-Type-Options": true,
"X-Drupal-Cache": true,
"X-Frame-Options": true,
"X-Generator": true
},
"HTTPResponseLocation": null,
"HTTPResponseTitle": "Nexa Center for Internet \u0026 Society | Il centro Nexa è un centro di ricerca del Dipartimento di Automatica e Informatica del Politecnico di Torino",
"HTTPResponseIsFinal": true,
"ControlDNSDomain": null,
"ControlDNSLookupFailure": null,
"ControlTCPConnectFailure": "",
"MatchWithControlIPAddress": true,
"MatchWithControlIPAddressASN": true,
"ControlTLSHandshakeFailure": "",
"ControlHTTPFailure": "",
"ControlHTTPResponseStatusCode": 200,
"ControlHTTPResponseBodyLength": 36564,
"ControlHTTPResponseHeadersKeys": {
"Cache-Control": true,
"Content-Language": true,
"Content-Type": true,
"Date": true,
"Etag": true,
"Expires": true,
"Last-Modified": true,
"Link": true,
"Server": true,
"Vary": true,
"X-Content-Type-Options": true,
"X-Drupal-Cache": true,
"X-Frame-Options": true,
"X-Generator": true
},
"ControlHTTPResponseTitle": "Nexa Center for Internet \u0026 Society | Il centro Nexa è un centro di ricerca del Dipartimento di Automatica e Informatica del Politecnico di Torino"
}
}
}
13 changes: 11 additions & 2 deletions internal/cmd/qatool/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,23 @@ func runWebConnectivityLTE(tc *webconnectivityqa.TestCase) {
// serialize the observations
mustSerializeMkdirAllAndWriteFile(actualDestdir, "observations.json", observationsContainer)

// convert to classic observations
observationsContainerClassic := minipipeline.ClassicFilter(observationsContainer)

// serialize the classic observations
mustSerializeMkdirAllAndWriteFile(actualDestdir, "observations_classic.json", observationsContainerClassic)

// analyze the observations
analysis := minipipeline.AnalyzeWebObservations(observationsContainer)

// serialize the observations analysis
mustSerializeMkdirAllAndWriteFile(actualDestdir, "analysis.json", analysis)

// print the analysis to stdout
fmt.Printf("%s\n", must.MarshalAndIndentJSON(analysis, "", " "))
// perform the classic analysis
analysisClassic := minipipeline.AnalyzeWebObservations(observationsContainerClassic)

// serialize the classic analysis results
mustSerializeMkdirAllAndWriteFile(actualDestdir, "analysis_classic.json", analysisClassic)
}
}

Expand Down
26 changes: 17 additions & 9 deletions internal/cmd/qatool/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,23 @@ func TestMainSuccess(t *testing.T) {

// make sure we attempted to write the desired files
expect := map[string]bool{
"xo/dnsBlockingAndroidDNSCacheNoData/measurement.json": true,
"xo/dnsBlockingAndroidDNSCacheNoData/observations.json": true,
"xo/dnsBlockingAndroidDNSCacheNoData/analysis.json": true,
"xo/dnsBlockingBOGON/analysis.json": true,
"xo/dnsBlockingBOGON/measurement.json": true,
"xo/dnsBlockingBOGON/observations.json": true,
"xo/dnsBlockingNXDOMAIN/measurement.json": true,
"xo/dnsBlockingNXDOMAIN/observations.json": true,
"xo/dnsBlockingNXDOMAIN/analysis.json": true,
"xo/dnsBlockingAndroidDNSCacheNoData/measurement.json": true,
"xo/dnsBlockingAndroidDNSCacheNoData/observations.json": true,
"xo/dnsBlockingAndroidDNSCacheNoData/observations_classic.json": true,
"xo/dnsBlockingAndroidDNSCacheNoData/analysis.json": true,
"xo/dnsBlockingAndroidDNSCacheNoData/analysis_classic.json": true,

"xo/dnsBlockingBOGON/analysis.json": true,
"xo/dnsBlockingBOGON/analysis_classic.json": true,
"xo/dnsBlockingBOGON/measurement.json": true,
"xo/dnsBlockingBOGON/observations.json": true,
"xo/dnsBlockingBOGON/observations_classic.json": true,

"xo/dnsBlockingNXDOMAIN/measurement.json": true,
"xo/dnsBlockingNXDOMAIN/observations.json": true,
"xo/dnsBlockingNXDOMAIN/observations_classic.json": true,
"xo/dnsBlockingNXDOMAIN/analysis.json": true,
"xo/dnsBlockingNXDOMAIN/analysis_classic.json": true,
}
got := make(map[string]bool)
for key := range contentmap {
Expand Down
53 changes: 53 additions & 0 deletions internal/minipipeline/classic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package minipipeline

// ClassicFilter takes in input a [*WebObservationsContainer] and returns in output
// another [*WebObservationsContainer] where we only keep:
//
// 1. DNS lookups using getaddrinfo;
//
// 2. IP addresses discovered using getaddrinfo;
//
// 3. endpoints using such IP addresses.
//
// We use this filter to produce a backward compatible Web Connectivity analysis
// when the input [*WebObservationsContainer] was built using LTE.
//
// The result should approximate what v0.4 would have measured.
func ClassicFilter(input *WebObservationsContainer) (output *WebObservationsContainer) {
output = &WebObservationsContainer{
DNSLookupFailures: []*WebObservation{},
DNSLookupSuccesses: []*WebObservation{},
KnownTCPEndpoints: map[int64]*WebObservation{},
knownIPAddresses: map[string]*WebObservation{},
}

// DNSLookupFailures
for _, entry := range input.DNSLookupFailures {
if !utilsEngineIsGetaddrinfo(entry.DNSEngine) {
continue
}
output.DNSLookupFailures = append(output.DNSLookupFailures, entry)
}

// DNSLookupSuccesses & knownIPAddresses
for _, entry := range input.DNSLookupSuccesses {
if !utilsEngineIsGetaddrinfo(entry.DNSEngine) {
continue
}
ipAddr := entry.IPAddress.Unwrap() // it MUST be there
output.DNSLookupSuccesses = append(output.DNSLookupSuccesses, entry)
output.knownIPAddresses[ipAddr] = entry
}

// KnownTCPEndpoints
for _, entry := range input.KnownTCPEndpoints {
ipAddr := entry.IPAddress.Unwrap() // it MUST be there
txid := entry.EndpointTransactionID.Unwrap()
if output.knownIPAddresses[ipAddr] == nil {
continue
}
output.KnownTCPEndpoints[txid] = entry
}

return
}
30 changes: 30 additions & 0 deletions internal/minipipeline/qa_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,32 +42,62 @@ func testMustRunAllWebTestCases(t *testing.T, topdir string) {
var expectedContainerData minipipeline.WebObservationsContainer
must.UnmarshalJSON(expectedContainerRaw, &expectedContainerData)

// load the expected classic container from the test case
expectedClassicContainerFile := filepath.Join(fullpath, "observations_classic.json")
expectedClassicContainerRaw := must.ReadFile(expectedClassicContainerFile)
var expectedClassicContainerData minipipeline.WebObservationsContainer
must.UnmarshalJSON(expectedClassicContainerRaw, &expectedClassicContainerData)

// load the expected analysis from the test case
expectedAnalysisFile := filepath.Join(fullpath, "analysis.json")
expectedAnalysisRaw := must.ReadFile(expectedAnalysisFile)
var expectedAnalysisData minipipeline.WebAnalysis
must.UnmarshalJSON(expectedAnalysisRaw, &expectedAnalysisData)

// load the expected classic analysis from the test case
expectedClassicAnalysisFile := filepath.Join(fullpath, "analysis_classic.json")
expectedClassicAnalysisRaw := must.ReadFile(expectedClassicAnalysisFile)
var expectedClassicAnalysisData minipipeline.WebAnalysis
must.UnmarshalJSON(expectedClassicAnalysisRaw, &expectedClassicAnalysisData)

// load the measurement into the pipeline
gotContainerData, err := minipipeline.IngestWebMeasurement(&measurementData)
if err != nil {
t.Fatal(err)
}

// convert the container into a classic container
gotClassicContainerData := minipipeline.ClassicFilter(gotContainerData)

// analyze the measurement
gotAnalysisData := minipipeline.AnalyzeWebObservations(gotContainerData)

// perform the classic web-connectivity-v0.4-like analysis
gotClassicAnalysisData := minipipeline.AnalyzeWebObservations(gotClassicContainerData)

t.Run("observations", func(t *testing.T) {
if diff := testCmpDiffUsingGenericMaps(&expectedContainerData, gotContainerData); diff != "" {
t.Fatal(diff)
}
})

t.Run("observations_classic", func(t *testing.T) {
if diff := testCmpDiffUsingGenericMaps(&expectedClassicContainerData, gotClassicContainerData); diff != "" {
t.Fatal(diff)
}
})

t.Run("analysis", func(t *testing.T) {
if diff := testCmpDiffUsingGenericMaps(&expectedAnalysisData, gotAnalysisData); diff != "" {
t.Fatal(diff)
}
})

t.Run("analysis_classic", func(t *testing.T) {
if diff := testCmpDiffUsingGenericMaps(&expectedClassicAnalysisData, gotClassicAnalysisData); diff != "" {
t.Fatal(diff)
}
})
})
}
})
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"DNSExperimentFailure": null,
"DNSTransactionsWithBogons": {},
"DNSTransactionsWithUnexpectedFailures": null,
"DNSPossiblyInvalidAddrs": {},
"DNSPossiblyInvalidAddrsClassic": {},
"DNSPossiblyNonexistingDomains": null,
"HTTPDiffBodyProportionFactor": null,
"HTTPDiffStatusCodeMatch": null,
"HTTPDiffTitleDifferentLongWords": null,
"HTTPDiffUncommonHeadersIntersection": null,
"HTTPFinalResponsesWithControl": null,
"HTTPFinalResponsesWithTLS": null,
"TCPTransactionsWithUnexpectedTCPConnectFailures": {},
"TCPTransactionsWithUnexpectedTLSHandshakeFailures": {},
"TCPTransactionsWithUnexpectedHTTPFailures": null,
"TCPTransactionsWithUnexplainedUnexpectedFailures": null
}
Loading

0 comments on commit 23a6844

Please sign in to comment.