Skip to content

Commit

Permalink
AMP FTS experiment logic
Browse files Browse the repository at this point in the history
This sets it up so we always ingest the FTS data and use the
`SuggestionProviderConstraints` passed to the query to determine how to
perform the query.  This seems like the simplest approach and it doesn't
increase ingestion time that much.  The benchmarks on my machine went
from 339.29 ms to 465.60 ms.
  • Loading branch information
bendk committed Jan 15, 2025
1 parent fc22cdb commit e9f0b48
Show file tree
Hide file tree
Showing 8 changed files with 408 additions and 30 deletions.
198 changes: 175 additions & 23 deletions components/suggest/src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use crate::{
fakespot,
geoname::GeonameCache,
pocket::{split_keyword, KeywordConfidence},
provider::SuggestionProvider,
provider::{AmpMatchingStrategy, SuggestionProvider},
query::FtsQuery,
rs::{
DownloadedAmoSuggestion, DownloadedAmpSuggestion, DownloadedAmpWikipediaSuggestion,
Expand Down Expand Up @@ -321,35 +321,67 @@ impl<'a> SuggestDao<'a> {
&self,
query: &SuggestionQuery,
suggestion_type: AmpSuggestionType,
) -> Result<Vec<Suggestion>> {
let strategy = query
.provider_constraints
.as_ref()
.and_then(|c| c.amp_alternative_matching.as_ref());
match strategy {
None => self.fetch_amp_suggestions_using_keywords(query, suggestion_type, true),
Some(AmpMatchingStrategy::NoKeywordExpansion) => {
self.fetch_amp_suggestions_using_keywords(query, suggestion_type, false)
}
Some(AmpMatchingStrategy::Fts) => {
self.fetch_amp_suggestions_using_fts(query, suggestion_type, "full_keywords")
}
Some(AmpMatchingStrategy::FtsAgainstTitle) => {
self.fetch_amp_suggestions_using_fts(query, suggestion_type, "title")
}
}
}

pub fn fetch_amp_suggestions_using_keywords(
&self,
query: &SuggestionQuery,
suggestion_type: AmpSuggestionType,
allow_keyword_expansion: bool,
) -> Result<Vec<Suggestion>> {
let keyword_lowercased = &query.keyword.to_lowercase();
let provider = match suggestion_type {
AmpSuggestionType::Mobile => SuggestionProvider::AmpMobile,
AmpSuggestionType::Desktop => SuggestionProvider::Amp,
};
let where_extra = if allow_keyword_expansion {
""
} else {
"AND INSTR(CONCAT(fk.full_keyword, ' '), k.keyword) != 0"
};
let suggestions = self.conn.query_rows_and_then_cached(
r#"
SELECT
s.id,
k.rank,
s.title,
s.url,
s.provider,
s.score,
fk.full_keyword
FROM
suggestions s
JOIN
keywords k
ON k.suggestion_id = s.id
LEFT JOIN
full_keywords fk
ON k.full_keyword_id = fk.id
WHERE
s.provider = :provider
AND k.keyword = :keyword
AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url)
"#,
&format!(
r#"
SELECT
s.id,
k.rank,
s.title,
s.url,
s.provider,
s.score,
fk.full_keyword
FROM
suggestions s
JOIN
keywords k
ON k.suggestion_id = s.id
LEFT JOIN
full_keywords fk
ON k.full_keyword_id = fk.id
WHERE
s.provider = :provider
AND k.keyword = :keyword
{where_extra}
AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url)
"#
),
named_params! {
":keyword": keyword_lowercased,
":provider": provider
Expand Down Expand Up @@ -427,6 +459,96 @@ impl<'a> SuggestDao<'a> {
Ok(suggestions)
}

pub fn fetch_amp_suggestions_using_fts(
&self,
query: &SuggestionQuery,
suggestion_type: AmpSuggestionType,
fts_column: &str,
) -> Result<Vec<Suggestion>> {
let fts_query = query.fts_query();
let match_arg = &fts_query.match_arg;
let provider = match suggestion_type {
AmpSuggestionType::Mobile => SuggestionProvider::AmpMobile,
AmpSuggestionType::Desktop => SuggestionProvider::Amp,
};
let suggestions = self.conn.query_rows_and_then_cached(
&format!(
r#"
SELECT
s.id,
s.title,
s.url,
s.provider,
s.score
FROM
suggestions s
JOIN
amp_fts fts
ON fts.rowid = s.id
WHERE
s.provider = :provider
AND amp_fts match '{fts_column}: {match_arg}'
AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url)
ORDER BY rank
LIMIT 1
"#
),
named_params! {
":provider": provider
},
|row| -> Result<Suggestion> {
let suggestion_id: i64 = row.get("id")?;
let title = row.get("title")?;
let raw_url: String = row.get("url")?;
let score: f64 = row.get("score")?;

self.conn.query_row_and_then(
r#"
SELECT
amp.advertiser,
amp.block_id,
amp.iab_category,
amp.impression_url,
amp.click_url,
i.data AS icon,
i.mimetype AS icon_mimetype
FROM
amp_custom_details amp
LEFT JOIN
icons i ON amp.icon_id = i.id
WHERE
amp.suggestion_id = :suggestion_id
"#,
named_params! {
":suggestion_id": suggestion_id
},
|row| {
let cooked_url = cook_raw_suggestion_url(&raw_url);
let raw_click_url = row.get::<_, String>("click_url")?;
let cooked_click_url = cook_raw_suggestion_url(&raw_click_url);

Ok(Suggestion::Amp {
block_id: row.get("block_id")?,
advertiser: row.get("advertiser")?,
iab_category: row.get("iab_category")?,
title,
url: cooked_url,
raw_url,
full_keyword: query.keyword.clone(),
icon: row.get("icon")?,
icon_mimetype: row.get("icon_mimetype")?,
impression_url: row.get("impression_url")?,
click_url: cooked_click_url,
raw_click_url,
score,
})
},
)
},
)?;
Ok(suggestions)
}

/// Fetches Suggestions of type Wikipedia provider that match the given query
pub fn fetch_wikipedia_suggestions(&self, query: &SuggestionQuery) -> Result<Vec<Suggestion>> {
let keyword_lowercased = &query.keyword.to_lowercase();
Expand Down Expand Up @@ -954,6 +1076,7 @@ impl<'a> SuggestDao<'a> {
let mut amp_insert = AmpInsertStatement::new(self.conn)?;
let mut wiki_insert = WikipediaInsertStatement::new(self.conn)?;
let mut keyword_insert = KeywordInsertStatement::new(self.conn)?;
let mut fts_insert = AmpFtsInsertStatement::new(self.conn)?;
for suggestion in suggestions {
self.scope.err_if_interrupted()?;
let common_details = suggestion.common_details();
Expand All @@ -974,6 +1097,11 @@ impl<'a> SuggestDao<'a> {
wiki_insert.execute(suggestion_id, wikipedia)?;
}
}
fts_insert.execute(
suggestion_id,
&common_details.full_keywords_joined(),
&common_details.title,
)?;
let mut full_keyword_inserter = FullKeywordInserter::new(self.conn, suggestion_id);
for keyword in common_details.keywords() {
let full_keyword_id = match (suggestion, keyword.full_keyword) {
Expand Down Expand Up @@ -1764,6 +1892,30 @@ impl<'conn> KeywordMetricsInsertStatement<'conn> {
}
}

pub(crate) struct AmpFtsInsertStatement<'conn>(rusqlite::Statement<'conn>);

impl<'conn> AmpFtsInsertStatement<'conn> {
pub(crate) fn new(conn: &'conn Connection) -> Result<Self> {
Ok(Self(conn.prepare(
"INSERT INTO amp_fts(rowid, full_keywords, title)
VALUES(?, ?, ?)
",
)?))
}

pub(crate) fn execute(
&mut self,
suggestion_id: i64,
full_keywords: &str,
title: &str,
) -> Result<()> {
self.0
.execute((suggestion_id, full_keywords, title))
.with_context("keyword metrics insert")?;
Ok(())
}
}

fn provider_config_meta_key(provider: SuggestionProvider) -> String {
format!("{}{}", PROVIDER_CONFIG_META_KEY_PREFIX, provider as u8)
}
2 changes: 1 addition & 1 deletion components/suggest/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ pub use config::{SuggestGlobalConfig, SuggestProviderConfig};
pub use error::{Error, SuggestApiError};
pub use geoname::{Geoname, GeonameMatch, GeonameType};
pub use metrics::{LabeledTimingSample, SuggestIngestionMetrics};
pub use provider::{SuggestionProvider, SuggestionProviderConstraints};
pub use provider::{AmpMatchingStrategy, SuggestionProvider, SuggestionProviderConstraints};
pub use query::{QueryWithMetricsResult, SuggestionQuery};
pub use store::{InterruptKind, SuggestIngestionConstraints, SuggestStore, SuggestStoreBuilder};
pub use suggestion::{raw_suggestion_url_matches, Suggestion};
Expand Down
17 changes: 17 additions & 0 deletions components/suggest/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,21 @@ pub struct SuggestionProviderConstraints {
/// settings record(s).
#[uniffi(default = None)]
pub exposure_suggestion_types: Option<Vec<String>>,
/// Which experiment branch should we use for the AMP provider while the FTS experiment is
/// ongoing?
/// `0` means the treatment branch.
#[uniffi(default = None)]
pub amp_alternative_matching: Option<AmpMatchingStrategy>,
}

#[derive(Clone, Debug, uniffi::Enum)]
pub enum AmpMatchingStrategy {
/// Disable keywords added via keyword expansion.
/// This eliminates keywords that for terms related to the "real" keywords, for example
/// misspellings like "underarmor" instead of "under armor"'.
NoKeywordExpansion,
/// Use FTS matching against the full keywords, joined together.
Fts,
/// Use FTS matching against the title field
FtsAgainstTitle,
}
1 change: 1 addition & 0 deletions components/suggest/src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ impl SuggestionQuery {
exposure_suggestion_types: Some(
suggestion_types.iter().map(|s| s.to_string()).collect(),
),
..SuggestionProviderConstraints::default()
}),
..Self::default()
}
Expand Down
15 changes: 14 additions & 1 deletion components/suggest/src/rs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
//! the new suggestion in their results, and return `Suggestion::T` variants
//! as needed.
use std::fmt;
use std::{collections::HashSet, fmt};

use remote_settings::{Attachment, RemoteSettingsRecord};
use serde::{Deserialize, Deserializer};
Expand Down Expand Up @@ -436,6 +436,19 @@ impl DownloadedSuggestionCommonDetails {
},
)
}

/// Get the full keywords as a single string
pub fn full_keywords_joined(&self) -> String {
let parts: HashSet<_> = self.full_keywords.iter().map(|(s, _)| s.as_str()).collect();
let mut result = String::new();
for (i, part) in parts.into_iter().enumerate() {
if i == 0 {
result.push(' ');
}
result.push_str(part);
}
result
}
}

#[derive(Debug, PartialEq, Eq)]
Expand Down
27 changes: 26 additions & 1 deletion components/suggest/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use sql_support::{
/// `clear_database()` by adding their names to `conditional_tables`, unless
/// they are cleared via a deletion trigger or there's some other good
/// reason not to do so.
pub const VERSION: u32 = 31;
pub const VERSION: u32 = 32;

/// The current Suggest database schema.
pub const SQL: &str = "
Expand Down Expand Up @@ -143,6 +143,14 @@ CREATE TRIGGER fakespot_ai AFTER INSERT ON fakespot_custom_details BEGIN
WHERE id = new.suggestion_id;
END;
CREATE VIRTUAL TABLE IF NOT EXISTS amp_fts USING FTS5(
full_keywords,
title,
content='',
contentless_delete=1,
tokenize=\"porter unicode61 remove_diacritics 2 tokenchars '''-'\"
);
-- DELETE/UPDATE triggers are difficult to implement, since the FTS contents are split between the fakespot_custom_details and suggestions tables.
-- If you use an AFTER trigger, then the data from the other table has already been deleted.
-- BEFORE triggers are discouraged by the SQLite docs.
Expand Down Expand Up @@ -597,6 +605,23 @@ CREATE INDEX geonames_alternates_geoname_id ON geonames_alternates(geoname_id);
)?;
Ok(())
}
31 => {
// Need to clear the database so that the FTS index will get filled.
clear_database(tx)?;
tx.execute_batch(
"
CREATE VIRTUAL TABLE IF NOT EXISTS amp_fts USING FTS5(
full_keywords,
title,
content='',
contentless_delete=1,
tokenize=\"porter unicode61 remove_diacritics 2 tokenchars '''-'\"
);
",
)?;
Ok(())
}
_ => Err(open_database::Error::IncompatibleVersion(version)),
}
}
Expand Down
Loading

0 comments on commit e9f0b48

Please sign in to comment.