From 1284a0d3333ddef8ddd536cc6a9fc1a9d833e554 Mon Sep 17 00:00:00 2001 From: Ben Dean-Kawamura Date: Tue, 14 Jan 2025 16:25:11 -0500 Subject: [PATCH] AMP FTS experiment logic This sets it up so we always ingest the FTS data and use the `SuggestionProviderConstraints` passed to the query to determine how to perform the query. This seems like the simplest approach and it doesn't increase ingestion time that much. The benchmarks on my machine went from 339.29 ms to 465.60 ms. --- components/suggest/src/db.rs | 198 +++++++++++++++++++++++++---- components/suggest/src/lib.rs | 2 +- components/suggest/src/provider.rs | 17 +++ components/suggest/src/query.rs | 1 + components/suggest/src/rs.rs | 19 ++- components/suggest/src/schema.rs | 27 +++- components/suggest/src/store.rs | 133 ++++++++++++++++++- examples/suggest-cli/src/main.rs | 41 +++++- 8 files changed, 408 insertions(+), 30 deletions(-) diff --git a/components/suggest/src/db.rs b/components/suggest/src/db.rs index fbeda64c65..7a7f857433 100644 --- a/components/suggest/src/db.rs +++ b/components/suggest/src/db.rs @@ -21,7 +21,7 @@ use crate::{ fakespot, geoname::GeonameCache, pocket::{split_keyword, KeywordConfidence}, - provider::SuggestionProvider, + provider::{AmpMatchingStrategy, SuggestionProvider}, query::FtsQuery, rs::{ DownloadedAmoSuggestion, DownloadedAmpSuggestion, DownloadedAmpWikipediaSuggestion, @@ -321,35 +321,67 @@ impl<'a> SuggestDao<'a> { &self, query: &SuggestionQuery, suggestion_type: AmpSuggestionType, + ) -> Result> { + let strategy = query + .provider_constraints + .as_ref() + .and_then(|c| c.amp_alternative_matching.as_ref()); + match strategy { + None => self.fetch_amp_suggestions_using_keywords(query, suggestion_type, true), + Some(AmpMatchingStrategy::NoKeywordExpansion) => { + self.fetch_amp_suggestions_using_keywords(query, suggestion_type, false) + } + Some(AmpMatchingStrategy::FtsAgainstFullKeywords) => { + self.fetch_amp_suggestions_using_fts(query, suggestion_type, "full_keywords") + } + Some(AmpMatchingStrategy::FtsAgainstTitle) => { + self.fetch_amp_suggestions_using_fts(query, suggestion_type, "title") + } + } + } + + pub fn fetch_amp_suggestions_using_keywords( + &self, + query: &SuggestionQuery, + suggestion_type: AmpSuggestionType, + allow_keyword_expansion: bool, ) -> Result> { let keyword_lowercased = &query.keyword.to_lowercase(); let provider = match suggestion_type { AmpSuggestionType::Mobile => SuggestionProvider::AmpMobile, AmpSuggestionType::Desktop => SuggestionProvider::Amp, }; + let where_extra = if allow_keyword_expansion { + "" + } else { + "AND INSTR(CONCAT(fk.full_keyword, ' '), k.keyword) != 0" + }; let suggestions = self.conn.query_rows_and_then_cached( - r#" - SELECT - s.id, - k.rank, - s.title, - s.url, - s.provider, - s.score, - fk.full_keyword - FROM - suggestions s - JOIN - keywords k - ON k.suggestion_id = s.id - LEFT JOIN - full_keywords fk - ON k.full_keyword_id = fk.id - WHERE - s.provider = :provider - AND k.keyword = :keyword - AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url) - "#, + &format!( + r#" + SELECT + s.id, + k.rank, + s.title, + s.url, + s.provider, + s.score, + fk.full_keyword + FROM + suggestions s + JOIN + keywords k + ON k.suggestion_id = s.id + LEFT JOIN + full_keywords fk + ON k.full_keyword_id = fk.id + WHERE + s.provider = :provider + AND k.keyword = :keyword + {where_extra} + AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url) + "# + ), named_params! { ":keyword": keyword_lowercased, ":provider": provider @@ -427,6 +459,96 @@ impl<'a> SuggestDao<'a> { Ok(suggestions) } + pub fn fetch_amp_suggestions_using_fts( + &self, + query: &SuggestionQuery, + suggestion_type: AmpSuggestionType, + fts_column: &str, + ) -> Result> { + let fts_query = query.fts_query(); + let match_arg = &fts_query.match_arg; + let provider = match suggestion_type { + AmpSuggestionType::Mobile => SuggestionProvider::AmpMobile, + AmpSuggestionType::Desktop => SuggestionProvider::Amp, + }; + let suggestions = self.conn.query_rows_and_then_cached( + &format!( + r#" + SELECT + s.id, + s.title, + s.url, + s.provider, + s.score + FROM + suggestions s + JOIN + amp_fts fts + ON fts.rowid = s.id + WHERE + s.provider = :provider + AND amp_fts match '{fts_column}: {match_arg}' + AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url) + ORDER BY rank + LIMIT 1 + "# + ), + named_params! { + ":provider": provider + }, + |row| -> Result { + let suggestion_id: i64 = row.get("id")?; + let title = row.get("title")?; + let raw_url: String = row.get("url")?; + let score: f64 = row.get("score")?; + + self.conn.query_row_and_then( + r#" + SELECT + amp.advertiser, + amp.block_id, + amp.iab_category, + amp.impression_url, + amp.click_url, + i.data AS icon, + i.mimetype AS icon_mimetype + FROM + amp_custom_details amp + LEFT JOIN + icons i ON amp.icon_id = i.id + WHERE + amp.suggestion_id = :suggestion_id + "#, + named_params! { + ":suggestion_id": suggestion_id + }, + |row| { + let cooked_url = cook_raw_suggestion_url(&raw_url); + let raw_click_url = row.get::<_, String>("click_url")?; + let cooked_click_url = cook_raw_suggestion_url(&raw_click_url); + + Ok(Suggestion::Amp { + block_id: row.get("block_id")?, + advertiser: row.get("advertiser")?, + iab_category: row.get("iab_category")?, + title, + url: cooked_url, + raw_url, + full_keyword: query.keyword.clone(), + icon: row.get("icon")?, + icon_mimetype: row.get("icon_mimetype")?, + impression_url: row.get("impression_url")?, + click_url: cooked_click_url, + raw_click_url, + score, + }) + }, + ) + }, + )?; + Ok(suggestions) + } + /// Fetches Suggestions of type Wikipedia provider that match the given query pub fn fetch_wikipedia_suggestions(&self, query: &SuggestionQuery) -> Result> { let keyword_lowercased = &query.keyword.to_lowercase(); @@ -954,6 +1076,7 @@ impl<'a> SuggestDao<'a> { let mut amp_insert = AmpInsertStatement::new(self.conn)?; let mut wiki_insert = WikipediaInsertStatement::new(self.conn)?; let mut keyword_insert = KeywordInsertStatement::new(self.conn)?; + let mut fts_insert = AmpFtsInsertStatement::new(self.conn)?; for suggestion in suggestions { self.scope.err_if_interrupted()?; let common_details = suggestion.common_details(); @@ -974,6 +1097,11 @@ impl<'a> SuggestDao<'a> { wiki_insert.execute(suggestion_id, wikipedia)?; } } + fts_insert.execute( + suggestion_id, + &common_details.full_keywords_joined(), + &common_details.title, + )?; let mut full_keyword_inserter = FullKeywordInserter::new(self.conn, suggestion_id); for keyword in common_details.keywords() { let full_keyword_id = match (suggestion, keyword.full_keyword) { @@ -1764,6 +1892,30 @@ impl<'conn> KeywordMetricsInsertStatement<'conn> { } } +pub(crate) struct AmpFtsInsertStatement<'conn>(rusqlite::Statement<'conn>); + +impl<'conn> AmpFtsInsertStatement<'conn> { + pub(crate) fn new(conn: &'conn Connection) -> Result { + Ok(Self(conn.prepare( + "INSERT INTO amp_fts(rowid, full_keywords, title) + VALUES(?, ?, ?) + ", + )?)) + } + + pub(crate) fn execute( + &mut self, + suggestion_id: i64, + full_keywords: &str, + title: &str, + ) -> Result<()> { + self.0 + .execute((suggestion_id, full_keywords, title)) + .with_context("amp fts insert")?; + Ok(()) + } +} + fn provider_config_meta_key(provider: SuggestionProvider) -> String { format!("{}{}", PROVIDER_CONFIG_META_KEY_PREFIX, provider as u8) } diff --git a/components/suggest/src/lib.rs b/components/suggest/src/lib.rs index 689f39a57b..7c64b7c6ca 100644 --- a/components/suggest/src/lib.rs +++ b/components/suggest/src/lib.rs @@ -28,7 +28,7 @@ pub use config::{SuggestGlobalConfig, SuggestProviderConfig}; pub use error::{Error, SuggestApiError}; pub use geoname::{Geoname, GeonameMatch, GeonameType}; pub use metrics::{LabeledTimingSample, SuggestIngestionMetrics}; -pub use provider::{SuggestionProvider, SuggestionProviderConstraints}; +pub use provider::{AmpMatchingStrategy, SuggestionProvider, SuggestionProviderConstraints}; pub use query::{QueryWithMetricsResult, SuggestionQuery}; pub use store::{InterruptKind, SuggestIngestionConstraints, SuggestStore, SuggestStoreBuilder}; pub use suggestion::{raw_suggestion_url_matches, Suggestion}; diff --git a/components/suggest/src/provider.rs b/components/suggest/src/provider.rs index 7a88f39f30..d310e7afa5 100644 --- a/components/suggest/src/provider.rs +++ b/components/suggest/src/provider.rs @@ -130,4 +130,21 @@ pub struct SuggestionProviderConstraints { /// settings record(s). #[uniffi(default = None)] pub exposure_suggestion_types: Option>, + /// Which experiment branch should we use for the AMP provider while the FTS experiment is + /// ongoing? + /// None means the treatment branch. + #[uniffi(default = None)] + pub amp_alternative_matching: Option, +} + +#[derive(Clone, Debug, uniffi::Enum)] +pub enum AmpMatchingStrategy { + /// Disable keywords added via keyword expansion. + /// This eliminates keywords that for terms related to the "real" keywords, for example + /// misspellings like "underarmor" instead of "under armor"'. + NoKeywordExpansion, + /// Use FTS matching against the full keywords, joined together. + FtsAgainstFullKeywords, + /// Use FTS matching against the title field + FtsAgainstTitle, } diff --git a/components/suggest/src/query.rs b/components/suggest/src/query.rs index f094f8c53a..3607549a8f 100644 --- a/components/suggest/src/query.rs +++ b/components/suggest/src/query.rs @@ -131,6 +131,7 @@ impl SuggestionQuery { exposure_suggestion_types: Some( suggestion_types.iter().map(|s| s.to_string()).collect(), ), + ..SuggestionProviderConstraints::default() }), ..Self::default() } diff --git a/components/suggest/src/rs.rs b/components/suggest/src/rs.rs index b74042868a..7dcd76b80f 100644 --- a/components/suggest/src/rs.rs +++ b/components/suggest/src/rs.rs @@ -31,7 +31,7 @@ //! the new suggestion in their results, and return `Suggestion::T` variants //! as needed. -use std::fmt; +use std::{collections::HashSet, fmt}; use remote_settings::{Attachment, RemoteSettingsRecord}; use serde::{Deserialize, Deserializer}; @@ -436,6 +436,23 @@ impl DownloadedSuggestionCommonDetails { }, ) } + + /// Get the full keywords as a single string + pub fn full_keywords_joined(&self) -> String { + let parts: HashSet<_> = self + .full_keywords + .iter() + .flat_map(|(s, _)| s.split_whitespace()) + .collect(); + let mut result = String::new(); + for (i, part) in parts.into_iter().enumerate() { + if i != 0 { + result.push(' '); + } + result.push_str(part); + } + result + } } #[derive(Debug, PartialEq, Eq)] diff --git a/components/suggest/src/schema.rs b/components/suggest/src/schema.rs index 8690ae1521..86931c8c9a 100644 --- a/components/suggest/src/schema.rs +++ b/components/suggest/src/schema.rs @@ -23,7 +23,7 @@ use sql_support::{ /// `clear_database()` by adding their names to `conditional_tables`, unless /// they are cleared via a deletion trigger or there's some other good /// reason not to do so. -pub const VERSION: u32 = 31; +pub const VERSION: u32 = 32; /// The current Suggest database schema. pub const SQL: &str = " @@ -143,6 +143,14 @@ CREATE TRIGGER fakespot_ai AFTER INSERT ON fakespot_custom_details BEGIN WHERE id = new.suggestion_id; END; +CREATE VIRTUAL TABLE IF NOT EXISTS amp_fts USING FTS5( + full_keywords, + title, + content='', + contentless_delete=1, + tokenize=\"porter unicode61 remove_diacritics 2 tokenchars '''-'\" +); + -- DELETE/UPDATE triggers are difficult to implement, since the FTS contents are split between the fakespot_custom_details and suggestions tables. -- If you use an AFTER trigger, then the data from the other table has already been deleted. -- BEFORE triggers are discouraged by the SQLite docs. @@ -597,6 +605,23 @@ CREATE INDEX geonames_alternates_geoname_id ON geonames_alternates(geoname_id); )?; Ok(()) } + 31 => { + // Need to clear the database so that the FTS index will get filled. + clear_database(tx)?; + tx.execute_batch( + " +CREATE VIRTUAL TABLE IF NOT EXISTS amp_fts USING FTS5( + full_keywords, + title, + content='', + contentless_delete=1, + tokenize=\"porter unicode61 remove_diacritics 2 tokenchars '''-'\" +); + + ", + )?; + Ok(()) + } _ => Err(open_database::Error::IncompatibleVersion(version)), } } diff --git a/components/suggest/src/store.rs b/components/suggest/src/store.rs index 0a8eb240f6..7493dbddfc 100644 --- a/components/suggest/src/store.rs +++ b/components/suggest/src/store.rs @@ -912,7 +912,9 @@ pub(crate) mod tests { use std::sync::atomic::{AtomicUsize, Ordering}; - use crate::{suggestion::FtsMatchInfo, testing::*, SuggestionProvider}; + use crate::{ + provider::AmpMatchingStrategy, suggestion::FtsMatchInfo, testing::*, SuggestionProvider, + }; /// In-memory Suggest store for testing pub(crate) struct TestStore { @@ -1147,6 +1149,128 @@ pub(crate) mod tests { Ok(()) } + #[test] + fn amp_no_keyword_expansion() -> anyhow::Result<()> { + before_each(); + + let store = TestStore::new( + MockRemoteSettingsClient::default() + // Setup the keywords such that: + // * There's a `chicken` keyword, which is not a substring of any full + // keywords (i.e. it was the result of keyword expansion). + // * There's a `los pollos ` keyword with an extra space + .with_record( + "data", + "1234", + los_pollos_amp().merge(json!({ + "keywords": ["los", "los pollos", "los pollos ", "los pollos hermanos", "chicken"], + "full_keywords": [("los pollos", 3), ("los pollos hermanos", 2)], + })) + ) + .with_icon(los_pollos_icon()), + ); + store.ingest(SuggestIngestionConstraints::all_providers()); + assert_eq!( + store.fetch_suggestions(SuggestionQuery { + provider_constraints: Some(SuggestionProviderConstraints { + amp_alternative_matching: Some(AmpMatchingStrategy::NoKeywordExpansion), + ..SuggestionProviderConstraints::default() + }), + // Should not match, because `chicken` is not a substring of a full keyword. + // i.e. it was added because of keyword expansion. + ..SuggestionQuery::amp("chicken") + }), + vec![], + ); + assert_eq!( + store.fetch_suggestions(SuggestionQuery { + provider_constraints: Some(SuggestionProviderConstraints { + amp_alternative_matching: Some(AmpMatchingStrategy::NoKeywordExpansion), + ..SuggestionProviderConstraints::default() + }), + // Should match, even though "los pollos " technically is not a substring + // because there's an extra space. The reason these keywords are in the DB is + // because we want to keep showing the current suggestion when the user types + // the space key. + ..SuggestionQuery::amp("los pollos ") + }), + vec![los_pollos_suggestion("los pollos")], + ); + Ok(()) + } + + #[test] + fn amp_fts() -> anyhow::Result<()> { + before_each(); + + let store = TestStore::new( + MockRemoteSettingsClient::default() + // Setup the keywords such that: + // * There's a `chicken` keyword, which is not a substring of any full + // keywords (i.e. it was the result of keyword expansion). + // * There's a `los pollos ` keyword with an extra space + .with_record( + "data", + "1234", + los_pollos_amp().merge(json!({ + "keywords": ["los", "los pollos", "los pollos ", "los pollos hermanos"], + "full_keywords": [("los pollos", 3), ("los pollos hermanos", 1)], + })), + ) + .with_icon(los_pollos_icon()), + ); + store.ingest(SuggestIngestionConstraints::all_providers()); + assert_eq!( + store.fetch_suggestions(SuggestionQuery { + provider_constraints: Some(SuggestionProviderConstraints { + amp_alternative_matching: Some(AmpMatchingStrategy::FtsAgainstFullKeywords), + ..SuggestionProviderConstraints::default() + }), + // "Hermanos" should match, even though it's not listed in the keywords, + // because this strategy uses an FTS match against the full keyword list. + ..SuggestionQuery::amp("Hermanos") + }), + vec![los_pollos_suggestion("Hermanos")], + ); + Ok(()) + } + + #[test] + fn amp_fts_against_title() -> anyhow::Result<()> { + before_each(); + + let store = TestStore::new( + MockRemoteSettingsClient::default() + // Setup the keywords such that: + // * There's a `chicken` keyword, which is not a substring of any full + // keywords (i.e. it was the result of keyword expansion). + // * There's a `los pollos ` keyword with an extra space + .with_record( + "data", + "1234", + los_pollos_amp().merge(json!({ + "keywords": ["los", "los pollos", "los pollos ", "los pollos hermanos"], + "full_keywords": [("los pollos", 3), ("los pollos hermanos", 1)], + })), + ) + .with_icon(los_pollos_icon()), + ); + store.ingest(SuggestIngestionConstraints::all_providers()); + assert_eq!( + store.fetch_suggestions(SuggestionQuery { + provider_constraints: Some(SuggestionProviderConstraints { + amp_alternative_matching: Some(AmpMatchingStrategy::FtsAgainstTitle), + ..SuggestionProviderConstraints::default() + }), + // "Albuquerque" should match, even though it's not listed in the keywords, + // because this strategy uses an FTS match against the title + ..SuggestionQuery::amp("Albuquerque") + }), + vec![los_pollos_suggestion("Albuquerque")], + ); + Ok(()) + } + /// Tests ingesting a data attachment containing a single suggestion, /// instead of an array of suggestions. #[test] @@ -2538,6 +2662,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["aaa".to_string(), "bbb".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); @@ -2810,6 +2935,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["aaa".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); @@ -2847,6 +2973,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["aaa".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); @@ -2946,6 +3073,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["bbb".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); @@ -2979,6 +3107,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["aaa".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); @@ -3029,6 +3158,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["aaa".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); @@ -3063,6 +3193,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["aaa".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); diff --git a/examples/suggest-cli/src/main.rs b/examples/suggest-cli/src/main.rs index 8565e42dd2..015c168c81 100644 --- a/examples/suggest-cli/src/main.rs +++ b/examples/suggest-cli/src/main.rs @@ -9,8 +9,8 @@ use clap::{Parser, Subcommand, ValueEnum}; use remote_settings::RemoteSettingsServer; use suggest::{ - SuggestIngestionConstraints, SuggestStore, SuggestStoreBuilder, SuggestionProvider, - SuggestionQuery, + AmpMatchingStrategy, SuggestIngestionConstraints, SuggestStore, SuggestStoreBuilder, + SuggestionProvider, SuggestionProviderConstraints, SuggestionQuery, }; static DB_PATH: &str = "suggest.db"; @@ -56,9 +56,31 @@ enum Commands { provider: Option, /// Input to search input: String, + #[clap(long, short)] + amp_matching_strategy: Option, }, } +#[derive(Clone, Debug, ValueEnum)] +enum AmpMatchingStrategyArg { + /// Use keyword matching, without keyword expansion + NoKeyword, + /// Use FTS matching + Fts, + /// Use FTS matching against the title + FtsTitle, +} + +impl Into for AmpMatchingStrategyArg { + fn into(self) -> AmpMatchingStrategy { + match self { + Self::NoKeyword => AmpMatchingStrategy::NoKeywordExpansion, + Self::Fts => AmpMatchingStrategy::FtsAgainstFullKeywords, + Self::FtsTitle => AmpMatchingStrategy::FtsAgainstTitle, + } + } +} + #[derive(Clone, Debug, ValueEnum)] enum SuggestionProviderArg { Amp, @@ -109,7 +131,15 @@ fn main() -> Result<()> { provider, input, fts_match_info, - } => query(&store, provider, input, fts_match_info, cli.verbose), + amp_matching_strategy, + } => query( + &store, + provider, + input, + fts_match_info, + amp_matching_strategy, + cli.verbose, + ), }; Ok(()) } @@ -185,6 +215,7 @@ fn query( provider: Option, input: String, fts_match_info: bool, + amp_matching_strategy: Option, verbose: bool, ) { let query = SuggestionQuery { @@ -193,6 +224,10 @@ fn query( None => SuggestionProvider::all().to_vec(), }, keyword: input, + provider_constraints: Some(SuggestionProviderConstraints { + amp_alternative_matching: amp_matching_strategy.map(Into::into), + ..SuggestionProviderConstraints::default() + }), ..SuggestionQuery::default() }; let mut results = store