-
Notifications
You must be signed in to change notification settings - Fork 229
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMP FTS experiment logic #6558
base: main
Are you sure you want to change the base?
AMP FTS experiment logic #6558
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,7 +21,7 @@ use crate::{ | |
fakespot, | ||
geoname::GeonameCache, | ||
pocket::{split_keyword, KeywordConfidence}, | ||
provider::SuggestionProvider, | ||
provider::{AmpMatchingStrategy, SuggestionProvider}, | ||
query::FtsQuery, | ||
rs::{ | ||
DownloadedAmoSuggestion, DownloadedAmpSuggestion, DownloadedAmpWikipediaSuggestion, | ||
|
@@ -321,35 +321,67 @@ impl<'a> SuggestDao<'a> { | |
&self, | ||
query: &SuggestionQuery, | ||
suggestion_type: AmpSuggestionType, | ||
) -> Result<Vec<Suggestion>> { | ||
let strategy = query | ||
.provider_constraints | ||
.as_ref() | ||
.and_then(|c| c.amp_alternative_matching.as_ref()); | ||
match strategy { | ||
None => self.fetch_amp_suggestions_using_keywords(query, suggestion_type, true), | ||
Some(AmpMatchingStrategy::NoKeywordExpansion) => { | ||
self.fetch_amp_suggestions_using_keywords(query, suggestion_type, false) | ||
} | ||
Some(AmpMatchingStrategy::FtsAgainstFullKeywords) => { | ||
self.fetch_amp_suggestions_using_fts(query, suggestion_type, "full_keywords") | ||
} | ||
Some(AmpMatchingStrategy::FtsAgainstTitle) => { | ||
self.fetch_amp_suggestions_using_fts(query, suggestion_type, "title") | ||
} | ||
} | ||
} | ||
|
||
pub fn fetch_amp_suggestions_using_keywords( | ||
&self, | ||
query: &SuggestionQuery, | ||
suggestion_type: AmpSuggestionType, | ||
allow_keyword_expansion: bool, | ||
) -> Result<Vec<Suggestion>> { | ||
let keyword_lowercased = &query.keyword.to_lowercase(); | ||
let provider = match suggestion_type { | ||
AmpSuggestionType::Mobile => SuggestionProvider::AmpMobile, | ||
AmpSuggestionType::Desktop => SuggestionProvider::Amp, | ||
}; | ||
let where_extra = if allow_keyword_expansion { | ||
"" | ||
} else { | ||
"AND INSTR(CONCAT(fk.full_keyword, ' '), k.keyword) != 0" | ||
}; | ||
let suggestions = self.conn.query_rows_and_then_cached( | ||
r#" | ||
SELECT | ||
s.id, | ||
k.rank, | ||
s.title, | ||
s.url, | ||
s.provider, | ||
s.score, | ||
fk.full_keyword | ||
FROM | ||
suggestions s | ||
JOIN | ||
keywords k | ||
ON k.suggestion_id = s.id | ||
LEFT JOIN | ||
full_keywords fk | ||
ON k.full_keyword_id = fk.id | ||
WHERE | ||
s.provider = :provider | ||
AND k.keyword = :keyword | ||
AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url) | ||
"#, | ||
&format!( | ||
r#" | ||
SELECT | ||
s.id, | ||
k.rank, | ||
s.title, | ||
s.url, | ||
s.provider, | ||
s.score, | ||
fk.full_keyword | ||
FROM | ||
suggestions s | ||
JOIN | ||
keywords k | ||
ON k.suggestion_id = s.id | ||
LEFT JOIN | ||
full_keywords fk | ||
ON k.full_keyword_id = fk.id | ||
WHERE | ||
s.provider = :provider | ||
AND k.keyword = :keyword | ||
{where_extra} | ||
AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url) | ||
"# | ||
), | ||
named_params! { | ||
":keyword": keyword_lowercased, | ||
":provider": provider | ||
|
@@ -427,6 +459,96 @@ impl<'a> SuggestDao<'a> { | |
Ok(suggestions) | ||
} | ||
|
||
pub fn fetch_amp_suggestions_using_fts( | ||
&self, | ||
query: &SuggestionQuery, | ||
suggestion_type: AmpSuggestionType, | ||
fts_column: &str, | ||
) -> Result<Vec<Suggestion>> { | ||
let fts_query = query.fts_query(); | ||
let match_arg = &fts_query.match_arg; | ||
let provider = match suggestion_type { | ||
AmpSuggestionType::Mobile => SuggestionProvider::AmpMobile, | ||
AmpSuggestionType::Desktop => SuggestionProvider::Amp, | ||
}; | ||
let suggestions = self.conn.query_rows_and_then_cached( | ||
&format!( | ||
r#" | ||
SELECT | ||
s.id, | ||
s.title, | ||
s.url, | ||
s.provider, | ||
s.score | ||
FROM | ||
suggestions s | ||
JOIN | ||
amp_fts fts | ||
ON fts.rowid = s.id | ||
WHERE | ||
s.provider = :provider | ||
AND amp_fts match '{fts_column}: {match_arg}' | ||
AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url) | ||
ORDER BY rank | ||
LIMIT 1 | ||
"# | ||
), | ||
named_params! { | ||
":provider": provider | ||
}, | ||
|row| -> Result<Suggestion> { | ||
let suggestion_id: i64 = row.get("id")?; | ||
let title = row.get("title")?; | ||
let raw_url: String = row.get("url")?; | ||
let score: f64 = row.get("score")?; | ||
|
||
self.conn.query_row_and_then( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This query and its row callback are the same as the ones in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried factoring this out, but there's actually some differences because the keywords are different. Let's leave the duplicate code in for now, if we end up supporting both exact keywords and AMP, we can refactor later. |
||
r#" | ||
SELECT | ||
amp.advertiser, | ||
amp.block_id, | ||
amp.iab_category, | ||
amp.impression_url, | ||
amp.click_url, | ||
i.data AS icon, | ||
i.mimetype AS icon_mimetype | ||
FROM | ||
amp_custom_details amp | ||
LEFT JOIN | ||
icons i ON amp.icon_id = i.id | ||
WHERE | ||
amp.suggestion_id = :suggestion_id | ||
"#, | ||
named_params! { | ||
":suggestion_id": suggestion_id | ||
}, | ||
|row| { | ||
let cooked_url = cook_raw_suggestion_url(&raw_url); | ||
let raw_click_url = row.get::<_, String>("click_url")?; | ||
let cooked_click_url = cook_raw_suggestion_url(&raw_click_url); | ||
|
||
Ok(Suggestion::Amp { | ||
block_id: row.get("block_id")?, | ||
advertiser: row.get("advertiser")?, | ||
iab_category: row.get("iab_category")?, | ||
title, | ||
url: cooked_url, | ||
raw_url, | ||
full_keyword: query.keyword.clone(), | ||
icon: row.get("icon")?, | ||
icon_mimetype: row.get("icon_mimetype")?, | ||
impression_url: row.get("impression_url")?, | ||
click_url: cooked_click_url, | ||
raw_click_url, | ||
score, | ||
}) | ||
}, | ||
) | ||
}, | ||
)?; | ||
Ok(suggestions) | ||
} | ||
|
||
/// Fetches Suggestions of type Wikipedia provider that match the given query | ||
pub fn fetch_wikipedia_suggestions(&self, query: &SuggestionQuery) -> Result<Vec<Suggestion>> { | ||
let keyword_lowercased = &query.keyword.to_lowercase(); | ||
|
@@ -954,6 +1076,7 @@ impl<'a> SuggestDao<'a> { | |
let mut amp_insert = AmpInsertStatement::new(self.conn)?; | ||
let mut wiki_insert = WikipediaInsertStatement::new(self.conn)?; | ||
let mut keyword_insert = KeywordInsertStatement::new(self.conn)?; | ||
let mut fts_insert = AmpFtsInsertStatement::new(self.conn)?; | ||
for suggestion in suggestions { | ||
self.scope.err_if_interrupted()?; | ||
let common_details = suggestion.common_details(); | ||
|
@@ -974,6 +1097,11 @@ impl<'a> SuggestDao<'a> { | |
wiki_insert.execute(suggestion_id, wikipedia)?; | ||
} | ||
} | ||
fts_insert.execute( | ||
suggestion_id, | ||
&common_details.full_keywords_joined(), | ||
&common_details.title, | ||
)?; | ||
Comment on lines
+1100
to
+1104
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, just thought of something: Should this be gated on provider constraints in the ingest constraints? Most people won't be in the experiment, and ideally they wouldn't pay the cost of ingesting into the FTS table. Desktop can easily pass in some ingest constraints for users in the experiment. Maybe it's not a big deal though. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My concern with that is that we won't re-ingest records if they've already been ingested, and I couldn't figure out a good way to handle this in It does add some time to ingestion, but not all that much. I put some metrics in the title. Maybe we could store the provider constraints in a column in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems doable: If the constraints are
I had that thought too when I added |
||
let mut full_keyword_inserter = FullKeywordInserter::new(self.conn, suggestion_id); | ||
for keyword in common_details.keywords() { | ||
let full_keyword_id = match (suggestion, keyword.full_keyword) { | ||
|
@@ -1764,6 +1892,30 @@ impl<'conn> KeywordMetricsInsertStatement<'conn> { | |
} | ||
} | ||
|
||
pub(crate) struct AmpFtsInsertStatement<'conn>(rusqlite::Statement<'conn>); | ||
|
||
impl<'conn> AmpFtsInsertStatement<'conn> { | ||
pub(crate) fn new(conn: &'conn Connection) -> Result<Self> { | ||
Ok(Self(conn.prepare( | ||
"INSERT INTO amp_fts(rowid, full_keywords, title) | ||
VALUES(?, ?, ?) | ||
", | ||
)?)) | ||
} | ||
|
||
pub(crate) fn execute( | ||
&mut self, | ||
suggestion_id: i64, | ||
full_keywords: &str, | ||
title: &str, | ||
) -> Result<()> { | ||
self.0 | ||
.execute((suggestion_id, full_keywords, title)) | ||
.with_context("amp fts insert")?; | ||
Ok(()) | ||
} | ||
} | ||
|
||
fn provider_config_meta_key(provider: SuggestionProvider) -> String { | ||
format!("{}{}", PROVIDER_CONFIG_META_KEY_PREFIX, provider as u8) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -130,4 +130,21 @@ pub struct SuggestionProviderConstraints { | |
/// settings record(s). | ||
#[uniffi(default = None)] | ||
pub exposure_suggestion_types: Option<Vec<String>>, | ||
/// Which experiment branch should we use for the AMP provider while the FTS experiment is | ||
/// ongoing? | ||
/// None means the treatment branch. | ||
#[uniffi(default = None)] | ||
pub amp_alternative_matching: Option<AmpMatchingStrategy>, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice! 👍 Could you update the doc comment please? (It still mentions zero) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: Sorry, in the same spirit of renaming this struct member, would you mind rewording the doc comment so it doesn't talk about experiments? And now that it's an enum, I think it would be fine for the comment to only mention what |
||
} | ||
|
||
#[derive(Clone, Debug, uniffi::Enum)] | ||
pub enum AmpMatchingStrategy { | ||
/// Disable keywords added via keyword expansion. | ||
/// This eliminates keywords that for terms related to the "real" keywords, for example | ||
/// misspellings like "underarmor" instead of "under armor"'. | ||
NoKeywordExpansion, | ||
/// Use FTS matching against the full keywords, joined together. | ||
FtsAgainstFullKeywords, | ||
/// Use FTS matching against the title field | ||
FtsAgainstTitle, | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we'll need an
ORDER BY
andLIMIT 1
for the FTS query. For non-FTS, we get away without them because AMP provides unique keywords, but an FTS query may match many suggestion rows -- at least when matching by title, I'm not as sure about full keywords, but that seems possible too.The obvious criteria to order by is the FTS
rank
, but I think we need to be careful since that boils down to bm25, which is a function of term frequency. That's not very applicable here, at least when matching by full keyword since you're generating thefull_keyword
column in the fts table by concating all full keywords for a suggestion (if I'm reading that right). So we'll end up with a full keyword like "amazon amazon fresh amazon login amazon prime login..." for the first suggestion indata-0000-0200
for example, which will rank highly for an "amazon" search. But maybe that's actually fair? I'm not sure.For now, maybe we could just order by suggestion ID to get a stable sort and hope that works OK. I'm not sure if there's a good answer.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great point. I'm thinking we can
rank
and de-dupe the keywords before inserting them. I'm not sure if that will work well or not, but it seems like it's worth a shot.