Skip to content

Commit

Permalink
Merge pull request #102 from guardian/pf/keywords-for-ap
Browse files Browse the repository at this point in the history
AP poller handles keywords
  • Loading branch information
bryophyta authored Jan 21, 2025
2 parents 924c62a + 727939f commit a4585c5
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 19 deletions.
7 changes: 7 additions & 0 deletions ingestion-lambda/src/handler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,11 @@ describe('processKeywords', () => {
'keyword 2',
]);
});

it('should handle arrays of strings, removing duplicates and empty strings', () => {
expect(processKeywords(['keyword1', 'keyword2', 'keyword1', ''])).toEqual([
'keyword1',
'keyword2',
]);
});
});
42 changes: 30 additions & 12 deletions ingestion-lambda/src/handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,38 @@ const isCurlyQuoteFailure = (e: SyntaxError): boolean => {
return !!e.message.match(/Unexpected token '[]'/);
};

/**
 * Normalises a list of raw keyword strings.
 *
 * Each entry has its surrounding whitespace trimmed; entries that are empty
 * after trimming are discarded, and repeated values are collapsed so that
 * only the first occurrence of each keyword is kept.
 *
 * @param keywords - Raw keyword strings, possibly padded, blank or repeated.
 * @returns The unique, trimmed, non-empty keywords in first-seen order.
 */
function cleanAndDedupeKeywords(keywords: string[]): string[] {
  // A Set iterates in insertion order, so first-seen order is preserved.
  const uniqueKeywords = new Set<string>();
  keywords.forEach((rawKeyword) => {
    const trimmed = rawKeyword.trim();
    if (trimmed.length > 0) {
      uniqueKeywords.add(trimmed);
    }
  });
  return Array.from(uniqueKeywords);
}

/**
 * Turns a `keywords` value into a clean list of keyword strings.
 *
 * The value may be absent, a single '+'-delimited string, or an array of
 * strings. Whatever form is given, the resulting entries are passed through
 * `cleanAndDedupeKeywords`, which trims them and drops blanks and duplicates.
 *
 * @param keywords - `undefined`, a '+'-separated string, or an array of strings.
 * @returns An empty array when `keywords` is `undefined`; otherwise the
 *   cleaned, de-duplicated keywords.
 */
export const processKeywords = (
  keywords: string | string[] | undefined,
): string[] => {
  if (keywords === undefined) {
    return [];
  }
  // Arrays are used as-is; a string is broken up on its '+' separators.
  const rawKeywords = Array.isArray(keywords) ? keywords : keywords.split('+');
  return cleanAndDedupeKeywords(rawKeywords);
};

const safeBodyParse = (body: string): IngestorInputBody => {
try {
return IngestorInputBodySchema.parse(JSON.parse(body));
const json = JSON.parse(body) as Record<string, unknown>;
const preprocessedKeywords = processKeywords(
json.keywords as string | string[] | undefined,
); // if it's not one of these, we probably want to throw an error
return IngestorInputBodySchema.parse({
...json,
keywords: preprocessedKeywords,
});
} catch (e) {
if (e instanceof SyntaxError && isCurlyQuoteFailure(e)) {
console.warn('Stripping badly escaped curly quote');
Expand All @@ -49,17 +78,6 @@ const safeBodyParse = (body: string): IngestorInputBody => {
}
};

/**
 * Splits a '+'-delimited keyword string into a clean list of keywords.
 *
 * Each segment is trimmed; segments that are empty after trimming are
 * dropped, and duplicates are removed, keeping the first occurrence.
 *
 * @param keywords - A '+'-separated keyword string, or `undefined`.
 * @returns An empty array when `keywords` is `undefined`; otherwise the
 *   unique, trimmed, non-empty keywords in first-seen order.
 */
export const processKeywords = (keywords: string | undefined): string[] => {
  if (keywords === undefined) {
    return [];
  }
  // Accumulate into a Set so duplicates are removed while order is retained.
  const distinct = keywords.split('+').reduce((acc, segment) => {
    const trimmed = segment.trim();
    return trimmed.length > 0 ? acc.add(trimmed) : acc;
  }, new Set<string>());
  return Array.from(distinct);
};

export const main = async (event: SQSEvent): Promise<SQSBatchResponse> => {
const records = event.Records;

Expand Down
24 changes: 17 additions & 7 deletions poller-lambdas/src/pollers/ap/apPoller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ type FeedItemWithContent = {
contentFromNitf: ContentFromNitf;
};

// https://api.ap.org/media/v/content/feed?page_size=10&in_my_plan=true
// https://api.ap.org/media/v/content/feed?page_size=10&in_my_plan=true&include=*
export const apPoller = (async (secret: SecretValue, input: PollerInput) => {
const baseUrl = 'https://api.ap.org/media/v';
const defaultFeedUrl = `${baseUrl}/content/feed?page_size=10&in_my_plan=true`;
const defaultFeedUrl = `${baseUrl}/content/feed?page_size=10&in_my_plan=true&include=*`;
const apiKey = secret;

const headers = {
Expand All @@ -35,7 +35,9 @@ export const apPoller = (async (secret: SecretValue, input: PollerInput) => {
`Received feed with ${feed.data?.current_item_count} items at ${timeReceived.toISOString()}`,
);

const valueForNextPoll = feed.data?.next_page ?? defaultFeedUrl;
const valueForNextPoll = feed.data?.next_page
? `${feed.data.next_page}&include=*`
: defaultFeedUrl;

const feedItems = feed.data?.items
?.map(({ item }) => item)
Expand Down Expand Up @@ -78,7 +80,7 @@ export const apPoller = (async (secret: SecretValue, input: PollerInput) => {
if (maybeNitfUrl === undefined) {
console.log(
JSON.stringify({
uud: feedItem.altids?.etag,
uuid: feedItem.altids?.etag,
message: `No NITF rendition found for AP item: ${feedItem.altids?.etag}; excluding from feed.`,
}),
);
Expand Down Expand Up @@ -165,6 +167,8 @@ function itemWithContentToDesiredOutput({
versioncreated,
bylines,
ednote,
subject,
keywords,
} = feedItem;

const { abstract, bodyContentHtml } = contentFromNitf;
Expand All @@ -173,20 +177,26 @@ function itemWithContentToDesiredOutput({
? [...bylines].map((byline) => byline.by).join(', ')
: contentFromNitf.byline;

const directSubjects =
subject?.filter((s) => s.rels?.includes('direct')).map((s) => s.name) ?? [];

const keywordsAsArray = keywords?.flatMap((k) => k.split(' ')) ?? [];

const amalgamatedKeywords = [...directSubjects, ...keywordsAsArray];

return {
externalId,
body: {
'source-feed': 'AP-Newswires',
version: feedItem.version?.toString() ?? '0',
type: type,
status: pubstatus,
firstVersion:
firstcreated /** @todo: we should double-check that these line up once we've got the FIP feed back */,
firstVersion: firstcreated,
versionCreated: versioncreated,
headline: title ?? headline ?? contentFromNitf.headline,
byline: bylineToUse,
priority: editorialpriority,
keywords: [],
keywords: amalgamatedKeywords,
body_text: bodyContentHtml,
abstract,
originalContentText: originalXmlContent,
Expand Down

0 comments on commit a4585c5

Please sign in to comment.