From 1a9d743ec19bf2be341c66ce553803ef7b63e9f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Wed, 13 Nov 2024 13:55:06 +0100 Subject: [PATCH 1/6] WIP --- .../U1731052735__llm-prompt-history.sql | 0 ...484783__drop-member-deprecated-columns.sql | 0 .../V1731052735__llm-prompt-history.sql | 62 +++++ ...484783__drop-member-deprecated-columns.sql | 5 + pnpm-lock.yaml | 95 ++++---- .../merge_suggestions_worker/package.json | 1 + .../src/activities.ts | 9 +- .../src/activities/common.ts | 78 +------ .../src/activities/memberMergeSuggestions.ts | 2 + .../organizationMergeSuggestions.ts | 1 + .../merge_suggestions_worker/src/types.ts | 25 --- .../src/workflows/mergeMembersWithLLM.ts | 50 +---- .../workflows/mergeOrganizationsWithLLM.ts | 33 +-- .../workflows/testMergingEntitiesWithLLM.ts | 24 +- .../members_enrichment_worker/package.json | 2 + .../src/activities.ts | 2 + .../src/activities/enrichment.ts | 134 ++++++++++- .../src/activities/getMembers.ts | 2 +- .../lf-auth0/authenticateLFAuth0.ts | 2 +- .../src/activities/lf-auth0/enrichLFAuth0.ts | 2 +- .../lf-auth0/getEnrichmentLFAuth0.ts | 2 +- .../lf-auth0/getLFIDEnrichableMembers.ts | 2 +- .../activities/lf-auth0/githubIdentities.ts | 2 +- .../src/activities/syncEnrichedData.ts | 2 +- .../members_enrichment_worker/src/main.ts | 39 +--- .../src/schedules/getMembersToEnrich.ts | 2 +- .../members_enrichment_worker/src/service.ts | 38 ++++ .../src/sources/crustdata/service.ts | 9 +- .../src/workflows/enrichMember.ts | 17 +- services/archetypes/worker/src/index.ts | 88 ++++---- services/libs/common_services/package.json | 1 + .../src/repos/llmPromptHistory.repo.ts | 31 +++ .../common_services/src/services/index.ts | 1 + .../src/services/llm.service.ts | 211 ++++++++++++++++++ .../llmSuggestionVerdicts.repo.ts | 43 ---- .../memberMergeSuggestions.repo.ts | 17 +- .../organizationMergeSuggestions.repo.ts | 32 +-- .../members_enrichment_worker/index.ts | 79 ++++++- services/libs/types/src/enums/activities.ts | 1 - services/libs/types/src/enums/index.ts | 2 +- services/libs/types/src/enums/llm.ts | 10 + services/libs/types/src/enums/merging.ts | 5 - services/libs/types/src/index.ts | 2 + services/libs/types/src/llm.ts | 73 ++++++ services/libs/types/src/members.ts | 2 + services/libs/types/src/merging.ts | 15 -- services/libs/types/src/organizations.ts | 1 + services/libs/types/src/premium/enrichment.ts | 22 ++ 48 files changed, 843 insertions(+), 435 deletions(-) create mode 100644 backend/src/database/migrations/U1731052735__llm-prompt-history.sql create mode 100644 backend/src/database/migrations/U1731484783__drop-member-deprecated-columns.sql create mode 100644 backend/src/database/migrations/V1731052735__llm-prompt-history.sql create mode 100644 backend/src/database/migrations/V1731484783__drop-member-deprecated-columns.sql create mode 100644 services/apps/premium/members_enrichment_worker/src/service.ts create mode 100644 services/libs/common_services/src/repos/llmPromptHistory.repo.ts create mode 100644 services/libs/common_services/src/services/llm.service.ts delete mode 100644 services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/llmSuggestionVerdicts.repo.ts delete mode 100644 services/libs/types/src/enums/activities.ts create mode 100644 services/libs/types/src/enums/llm.ts create mode 100644 services/libs/types/src/llm.ts diff --git a/backend/src/database/migrations/U1731052735__llm-prompt-history.sql b/backend/src/database/migrations/U1731052735__llm-prompt-history.sql new file mode 100644 index 
0000000000..e69de29bb2 diff --git a/backend/src/database/migrations/U1731484783__drop-member-deprecated-columns.sql b/backend/src/database/migrations/U1731484783__drop-member-deprecated-columns.sql new file mode 100644 index 0000000000..e69de29bb2 diff --git a/backend/src/database/migrations/V1731052735__llm-prompt-history.sql b/backend/src/database/migrations/V1731052735__llm-prompt-history.sql new file mode 100644 index 0000000000..19964fb7c3 --- /dev/null +++ b/backend/src/database/migrations/V1731052735__llm-prompt-history.sql @@ -0,0 +1,62 @@ +create table "llmPromptHistory" ( + id bigserial primary key, + type varchar(255) not null, + model text not null, + "entityId" text null, + metadata jsonb null, + prompt text not null, + answer text not null, + "inputTokenCount" int not null, + "outputTokenCount" int not null, + "responseTimeSeconds" decimal not null, + "createdAt" timestamptz not null default now() +); + +create index "ix_llmPromptHistory_type_entityId" on "llmPromptHistory"("type", "entityId"); +create index "ix_llmPromptHistory_entityId" on "llmPromptHistory"("entityId"); +create index "ix_llmPromptHistory_type" on "llmPromptHistory"("type"); +create index "ix_llmPromptHistory_secondaryId" on "llmPromptHistory" (((metadata->>'secondaryId')::uuid)) where type in ('organization_merge_suggestion', 'member_merge_suggestion'); + +insert into "llmPromptHistory"(type, model, "entityId", metadata, prompt, answer, "inputTokenCount", "outputTokenCount", "responseTimeSeconds") +select 'organization_merge_suggestion', + model, + "primaryId", + json_build_object( + 'secondaryId', "secondaryId" + ), + prompt, + verdict, + "inputTokenCount", + "outputTokenCount", + "responseTimeSeconds" +from "llmSuggestionVerdicts" +where type = 'organization'; + +delete from "llmSuggestionVerdicts" where type = 'organization'; + +insert into "llmPromptHistory"(type, model, "entityId", metadata, prompt, answer, "inputTokenCount", "outputTokenCount", "responseTimeSeconds") +select 'member_merge_suggestion', + model, + "primaryId", + json_build_object( + 'secondaryId', "secondaryId" + ), + prompt, + verdict, + "inputTokenCount", + "outputTokenCount", + "responseTimeSeconds" +from "llmSuggestionVerdicts" +where type = 'member'; + +delete from "llmSuggestionVerdicts" where type = 'member'; + +do +$$ + begin + if (select count(*) from "llmSuggestionVerdicts") > 0 then + raise exception 'Table llmSuggestionVerdicts is not empty - contains % rows', (select count(*) from "llmSuggestionVerdicts"); + end if; + drop table "llmSuggestionVerdicts"; + end +$$; \ No newline at end of file diff --git a/backend/src/database/migrations/V1731484783__drop-member-deprecated-columns.sql b/backend/src/database/migrations/V1731484783__drop-member-deprecated-columns.sql new file mode 100644 index 0000000000..257a90ace7 --- /dev/null +++ b/backend/src/database/migrations/V1731484783__drop-member-deprecated-columns.sql @@ -0,0 +1,5 @@ +alter table members + drop column "oldEmails"; + +alter table members + drop column "oldWeakIdentities"; \ No newline at end of file diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 010f0cf60f..872d4a8f68 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1058,6 +1058,9 @@ importers: '@crowd/common': specifier: workspace:* version: link:../../libs/common + '@crowd/common_services': + specifier: workspace:* + version: link:../../libs/common_services '@crowd/data-access-layer': specifier: workspace:* version: link:../../libs/data-access-layer @@ -1110,6 +1113,9 @@ importers: '@crowd/common': 
specifier: workspace:* version: link:../../../libs/common + '@crowd/common_services': + specifier: workspace:* + version: link:../../../libs/common_services '@crowd/data-access-layer': specifier: workspace:* version: link:../../../libs/data-access-layer @@ -1704,6 +1710,9 @@ importers: services/libs/common_services: dependencies: + '@aws-sdk/client-bedrock-runtime': + specifier: ^3.572.0 + version: 3.572.0 '@crowd/common': specifier: workspace:* version: link:../common @@ -9990,8 +9999,8 @@ snapshots: dependencies: '@aws-crypto/sha256-browser': 3.0.0 '@aws-crypto/sha256-js': 3.0.0 - '@aws-sdk/client-sso-oidc': 3.572.0(@aws-sdk/client-sts@3.572.0) - '@aws-sdk/client-sts': 3.572.0 + '@aws-sdk/client-sso-oidc': 3.572.0 + '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) '@aws-sdk/core': 3.572.0 '@aws-sdk/credential-provider-node': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0) '@aws-sdk/middleware-host-header': 3.567.0 @@ -10171,11 +10180,11 @@ snapshots: transitivePeerDependencies: - aws-crt - '@aws-sdk/client-sso-oidc@3.572.0(@aws-sdk/client-sts@3.572.0)': + '@aws-sdk/client-sso-oidc@3.572.0': dependencies: '@aws-crypto/sha256-browser': 3.0.0 '@aws-crypto/sha256-js': 3.0.0 - '@aws-sdk/client-sts': 3.572.0 + '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) '@aws-sdk/core': 3.572.0 '@aws-sdk/credential-provider-node': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0) '@aws-sdk/middleware-host-header': 3.567.0 @@ -10214,7 +10223,6 @@ snapshots: '@smithy/util-utf8': 2.3.0 tslib: 2.6.2 transitivePeerDependencies: - - '@aws-sdk/client-sts' - aws-crt '@aws-sdk/client-sso@3.556.0': @@ -10347,11 +10355,11 @@ snapshots: transitivePeerDependencies: - aws-crt - '@aws-sdk/client-sts@3.572.0': + '@aws-sdk/client-sts@3.572.0(@aws-sdk/client-sso-oidc@3.572.0)': dependencies: '@aws-crypto/sha256-browser': 3.0.0 '@aws-crypto/sha256-js': 3.0.0 - '@aws-sdk/client-sso-oidc': 3.572.0(@aws-sdk/client-sts@3.572.0) + '@aws-sdk/client-sso-oidc': 3.572.0 '@aws-sdk/core': 3.572.0 '@aws-sdk/credential-provider-node': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0) '@aws-sdk/middleware-host-header': 3.567.0 @@ -10390,6 +10398,7 @@ snapshots: '@smithy/util-utf8': 2.3.0 tslib: 2.6.2 transitivePeerDependencies: + - '@aws-sdk/client-sso-oidc' - aws-crt '@aws-sdk/core@3.556.0': @@ -10469,7 +10478,7 @@ snapshots: '@aws-sdk/credential-provider-ini@3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0)': dependencies: - '@aws-sdk/client-sts': 3.572.0 + '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) '@aws-sdk/credential-provider-env': 3.568.0 '@aws-sdk/credential-provider-process': 3.572.0 '@aws-sdk/credential-provider-sso': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) @@ -10575,7 +10584,7 @@ snapshots: '@aws-sdk/credential-provider-web-identity@3.568.0(@aws-sdk/client-sts@3.572.0)': dependencies: - '@aws-sdk/client-sts': 3.572.0 + '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) '@aws-sdk/types': 3.567.0 '@smithy/property-provider': 2.2.0 '@smithy/types': 2.12.0 @@ -10720,7 +10729,7 @@ snapshots: '@aws-sdk/token-providers@3.572.0(@aws-sdk/client-sso-oidc@3.572.0)': dependencies: - '@aws-sdk/client-sso-oidc': 3.572.0(@aws-sdk/client-sts@3.572.0) + '@aws-sdk/client-sso-oidc': 3.572.0 '@aws-sdk/types': 3.567.0 '@smithy/property-provider': 2.2.0 '@smithy/shared-ini-file-loader': 2.4.0 @@ -10823,7 +10832,7 @@ snapshots: '@babel/traverse': 7.24.1 '@babel/types': 7.24.0 convert-source-map: 
2.0.0 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) gensync: 1.0.0-beta.2 json5: 2.2.3 semver: 6.3.1 @@ -10884,7 +10893,7 @@ snapshots: '@babel/core': 7.24.4 '@babel/helper-compilation-targets': 7.23.6 '@babel/helper-plugin-utils': 7.24.0 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) lodash.debounce: 4.0.8 resolve: 1.22.8 transitivePeerDependencies: @@ -11539,7 +11548,7 @@ snapshots: '@babel/helper-split-export-declaration': 7.22.6 '@babel/parser': 7.24.4 '@babel/types': 7.24.0 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) globals: 11.12.0 transitivePeerDependencies: - supports-color @@ -11554,7 +11563,7 @@ snapshots: '@babel/helper-split-export-declaration': 7.22.6 '@babel/parser': 7.24.4 '@babel/types': 7.24.0 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) globals: 11.12.0 transitivePeerDependencies: - supports-color @@ -11730,7 +11739,7 @@ snapshots: '@eslint/eslintrc@2.1.4': dependencies: ajv: 6.12.6 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) espree: 9.6.1 globals: 13.24.0 ignore: 5.3.1 @@ -11844,7 +11853,7 @@ snapshots: '@humanwhocodes/config-array@0.11.14': dependencies: '@humanwhocodes/object-schema': 2.0.3 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) minimatch: 3.1.2 transitivePeerDependencies: - supports-color @@ -12362,7 +12371,7 @@ snapshots: '@opensearch-project/opensearch@2.11.0': dependencies: aws4: 1.12.0 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) hpagent: 1.2.0 json11: 1.1.2 ms: 2.1.3 @@ -12691,7 +12700,7 @@ snapshots: async: 3.2.5 chalk: 3.0.0 dayjs: 1.8.36 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) eventemitter2: 5.0.1 fast-json-patch: 3.1.1 fclone: 1.0.11 @@ -12711,7 +12720,7 @@ snapshots: '@opencensus/core': 0.0.9 '@opencensus/propagation-b3': 0.0.8 async: 2.6.4 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) eventemitter2: 6.4.9 require-in-the-middle: 5.2.0 semver: 7.5.4 @@ -12724,7 +12733,7 @@ snapshots: '@pm2/js-api@0.8.0(bufferutil@4.0.8)(utf-8-validate@5.0.10)': dependencies: async: 2.6.4 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) eventemitter2: 6.4.9 extrareqp2: 1.0.0(debug@4.3.4) ws: 7.5.9(bufferutil@4.0.8)(utf-8-validate@5.0.10) @@ -13265,7 +13274,7 @@ snapshots: '@superfaceai/parser': 1.2.0 abort-controller: 3.0.0 cross-fetch: 3.1.8(encoding@0.1.13) - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) isomorphic-form-data: 2.0.0 vm2: 3.9.19 transitivePeerDependencies: @@ -13276,7 +13285,7 @@ snapshots: dependencies: '@superfaceai/ast': 1.2.0 '@types/debug': 4.1.12 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) typescript: 4.9.5 transitivePeerDependencies: - supports-color @@ -13671,7 +13680,7 @@ snapshots: '@typescript-eslint/scope-manager': 5.62.0 '@typescript-eslint/type-utils': 5.62.0(eslint@8.57.0)(typescript@5.6.3) '@typescript-eslint/utils': 5.62.0(eslint@8.57.0)(typescript@5.6.3) - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) eslint: 8.57.0 graphemer: 1.4.0 ignore: 5.3.1 @@ -13708,7 +13717,7 @@ snapshots: '@typescript-eslint/scope-manager': 5.62.0 '@typescript-eslint/types': 5.62.0 '@typescript-eslint/typescript-estree': 5.62.0(typescript@5.6.3) - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) eslint: 8.57.0 optionalDependencies: typescript: 5.6.3 @@ -13742,7 +13751,7 @@ snapshots: dependencies: '@typescript-eslint/typescript-estree': 5.62.0(typescript@5.6.3) '@typescript-eslint/utils': 5.62.0(eslint@8.57.0)(typescript@5.6.3) - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) eslint: 8.57.0 tsutils: 3.21.0(typescript@5.6.3) optionalDependencies: @@ -13770,7 +13779,7 @@ 
snapshots: dependencies: '@typescript-eslint/types': 5.62.0 '@typescript-eslint/visitor-keys': 5.62.0 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) globby: 11.1.0 is-glob: 4.0.3 semver: 7.6.0 @@ -14939,10 +14948,6 @@ snapshots: optionalDependencies: supports-color: 5.5.0 - debug@4.3.4: - dependencies: - ms: 2.1.2 - debug@4.3.4(supports-color@5.5.0): dependencies: ms: 2.1.2 @@ -15205,7 +15210,7 @@ snapshots: base64id: 2.0.0 cookie: 0.4.2 cors: 2.8.5 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) engine.io-parser: 5.2.2 ws: 8.11.0(bufferutil@4.0.8)(utf-8-validate@5.0.10) transitivePeerDependencies: @@ -15518,7 +15523,7 @@ snapshots: ajv: 6.12.6 chalk: 4.1.2 cross-spawn: 7.0.3 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) doctrine: 3.0.0 escape-string-regexp: 4.0.0 eslint-scope: 7.2.2 @@ -15829,7 +15834,7 @@ snapshots: follow-redirects@1.15.6(debug@4.3.4): optionalDependencies: - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) for-each@0.3.3: dependencies: @@ -17096,7 +17101,7 @@ snapshots: dependencies: '@types/express': 4.17.21 '@types/jsonwebtoken': 9.0.6 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) jose: 4.15.5 limiter: 1.1.5 lru-memoizer: 2.2.0 @@ -18168,7 +18173,7 @@ snapshots: pm2-axon-rpc@0.7.1: dependencies: - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -18176,7 +18181,7 @@ snapshots: dependencies: amp: 0.3.1 amp-message: 0.1.2 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) escape-string-regexp: 4.0.0 transitivePeerDependencies: - supports-color @@ -18193,7 +18198,7 @@ snapshots: pm2-sysmonit@1.2.8: dependencies: async: 3.2.5 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) pidusage: 2.0.21 systeminformation: 5.22.7 tx2: 1.0.5 @@ -18215,7 +18220,7 @@ snapshots: commander: 2.15.1 croner: 4.1.97 dayjs: 1.11.11 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) enquirer: 2.3.6 eventemitter2: 5.0.1 fclone: 1.0.11 @@ -18426,7 +18431,7 @@ snapshots: command-line-usage: 6.1.3 config: 3.3.11 configstore: 5.0.1 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) editor: 1.0.0 enquirer: 2.4.1 form-data: 4.0.0 @@ -18552,7 +18557,7 @@ snapshots: require-in-the-middle@5.2.0: dependencies: - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) module-details-from-path: 1.0.3 resolve: 1.22.8 transitivePeerDependencies: @@ -18763,7 +18768,7 @@ snapshots: dependencies: '@types/debug': 4.1.12 '@types/validator': 13.11.9 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) dottie: 2.0.6 inflection: 1.13.4 lodash: 4.17.21 @@ -18899,7 +18904,7 @@ snapshots: socket.io-adapter@2.5.4(bufferutil@4.0.8)(utf-8-validate@5.0.10): dependencies: - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) ws: 8.11.0(bufferutil@4.0.8)(utf-8-validate@5.0.10) transitivePeerDependencies: - bufferutil @@ -18909,7 +18914,7 @@ snapshots: socket.io-parser@4.2.4: dependencies: '@socket.io/component-emitter': 3.1.2 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -18918,7 +18923,7 @@ snapshots: accepts: 1.3.8 base64id: 2.0.0 cors: 2.8.5 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) engine.io: 6.5.4(bufferutil@4.0.8)(utf-8-validate@5.0.10) socket.io-adapter: 2.5.4(bufferutil@4.0.8)(utf-8-validate@5.0.10) socket.io-parser: 4.2.4 @@ -19121,7 +19126,7 @@ snapshots: dependencies: component-emitter: 1.3.1 cookiejar: 2.1.4 - debug: 4.3.4 + debug: 4.3.4(supports-color@5.5.0) fast-safe-stringify: 2.1.1 form-data: 4.0.0 formidable: 2.1.2 diff --git a/services/apps/merge_suggestions_worker/package.json 
b/services/apps/merge_suggestions_worker/package.json index 94198d4ea4..3ca639f251 100644 --- a/services/apps/merge_suggestions_worker/package.json +++ b/services/apps/merge_suggestions_worker/package.json @@ -16,6 +16,7 @@ "@crowd/archetype-standard": "workspace:*", "@crowd/archetype-worker": "workspace:*", "@crowd/common": "workspace:*", + "@crowd/common_services": "workspace:*", "@crowd/data-access-layer": "workspace:*", "@crowd/feature-flags": "workspace:*", "@crowd/logging": "workspace:*", diff --git a/services/apps/merge_suggestions_worker/src/activities.ts b/services/apps/merge_suggestions_worker/src/activities.ts index fe3cde744d..3f26286967 100644 --- a/services/apps/merge_suggestions_worker/src/activities.ts +++ b/services/apps/merge_suggestions_worker/src/activities.ts @@ -1,10 +1,4 @@ -import { - getAllTenants, - getLLMResult, - mergeMembers, - mergeOrganizations, - saveLLMVerdict, -} from './activities/common' +import { getAllTenants, getLLMResult, mergeMembers, mergeOrganizations } from './activities/common' import { addMemberToMerge, findTenantsLatestMemberSuggestionGeneratedAt, @@ -41,7 +35,6 @@ export { getOrganizationsForLLMConsumption, getRawOrganizationMergeSuggestions, getRawMemberMergeSuggestions, - saveLLMVerdict, mergeMembers, mergeOrganizations, } diff --git a/services/apps/merge_suggestions_worker/src/activities/common.ts b/services/apps/merge_suggestions_worker/src/activities/common.ts index c55cfdb9e4..8bf077ae62 100644 --- a/services/apps/merge_suggestions_worker/src/activities/common.ts +++ b/services/apps/merge_suggestions_worker/src/activities/common.ts @@ -1,21 +1,19 @@ /* eslint-disable @typescript-eslint/no-explicit-any */ -import { BedrockRuntimeClient, InvokeModelCommand } from '@aws-sdk/client-bedrock-runtime' import axios from 'axios' -import { performance } from 'perf_hooks' +import { LlmService } from '@crowd/common_services' import { ITenant } from '@crowd/data-access-layer/src/old/apps/merge_suggestions_worker//types' -import LLMSuggestionVerdictsRepository from '@crowd/data-access-layer/src/old/apps/merge_suggestions_worker/llmSuggestionVerdicts.repo' import TenantRepository from '@crowd/data-access-layer/src/old/apps/merge_suggestions_worker/tenant.repo' import { isFeatureEnabled } from '@crowd/feature-flags' import { FeatureFlag, ILLMConsumableMember, ILLMConsumableOrganization, - ILLMSuggestionVerdict, + ILlmResult, + LlmQueryType, } from '@crowd/types' import { svc } from '../main' -import { ILLMResult } from '../types' export async function getAllTenants(): Promise { const tenantRepository = new TenantRepository(svc.postgres.writer.connection(), svc.log) @@ -43,74 +41,22 @@ export async function getAllTenants(): Promise { } export async function getLLMResult( + type: LlmQueryType.MEMBER_MERGE | LlmQueryType.ORGANIZATION_MERGE, suggestion: ILLMConsumableMember[] | ILLMConsumableOrganization[], - modelId: string, - prompt: string, - region: string, - modelSpecificArgs: any, -): Promise { - if (suggestion.length !== 2) { - console.log(suggestion) - throw new Error('Exactly 2 entities are required for LLM comparison') - } - const client = new BedrockRuntimeClient({ - credentials: { +): Promise> { + const llmService = new LlmService( + svc.postgres.writer, + { accessKeyId: process.env['CROWD_AWS_BEDROCK_ACCESS_KEY_ID'], secretAccessKey: process.env['CROWD_AWS_BEDROCK_SECRET_ACCESS_KEY'], }, - region, - }) - - const start = performance.now() - - const end = () => { - const end = performance.now() - const duration = end - start - return 
Math.ceil(duration / 1000) - } - - const fullPrompt = `Your task is to analyze the following two json documents. ${JSON.stringify( - suggestion, - )} . ${prompt}` - - const command = new InvokeModelCommand({ - body: JSON.stringify({ - messages: [ - { - role: 'user', - content: [ - { - type: 'text', - text: fullPrompt, - }, - ], - }, - ], - ...modelSpecificArgs, - }), - modelId, - accept: 'application/json', - contentType: 'application/json', - }) - - const res = await client.send(command) - - return { - body: JSON.parse(res.body.transformToString()), - prompt: fullPrompt, - modelSpecificArgs, - responseTimeSeconds: end(), - } -} - -export async function saveLLMVerdict(verdict: ILLMSuggestionVerdict): Promise { - const llmVerdictRepository = new LLMSuggestionVerdictsRepository( - svc.postgres.writer.connection(), svc.log, ) - return llmVerdictRepository.saveLLMVerdict(verdict) -} + const result = await llmService.mergeSuggestion(type, suggestion) + + return result +} export async function mergeMembers( primaryMemberId: string, secondaryMemberId: string, diff --git a/services/apps/merge_suggestions_worker/src/activities/memberMergeSuggestions.ts b/services/apps/merge_suggestions_worker/src/activities/memberMergeSuggestions.ts index 48e448732c..895835abff 100644 --- a/services/apps/merge_suggestions_worker/src/activities/memberMergeSuggestions.ts +++ b/services/apps/merge_suggestions_worker/src/activities/memberMergeSuggestions.ts @@ -307,6 +307,7 @@ export async function getMembersForLLMConsumption( if (primaryMember) { result.push({ + id: primaryMember.id, displayName: primaryMember.displayName, joinedAt: primaryMember.joinedAt, attributes: primaryMember.attributes, @@ -323,6 +324,7 @@ export async function getMembersForLLMConsumption( if (secondaryMember) { result.push({ + id: secondaryMember.id, joinedAt: secondaryMember.joinedAt, displayName: secondaryMember.displayName, attributes: secondaryMember.attributes, diff --git a/services/apps/merge_suggestions_worker/src/activities/organizationMergeSuggestions.ts b/services/apps/merge_suggestions_worker/src/activities/organizationMergeSuggestions.ts index bec8f29746..99e9661bb1 100644 --- a/services/apps/merge_suggestions_worker/src/activities/organizationMergeSuggestions.ts +++ b/services/apps/merge_suggestions_worker/src/activities/organizationMergeSuggestions.ts @@ -358,6 +358,7 @@ async function prepareOrg( ]) return { + id: base.id, displayName: base.displayName, description: base.description, phoneNumbers: attributes.filter((a) => a.name === 'phoneNumber').map((a) => a.value), diff --git a/services/apps/merge_suggestions_worker/src/types.ts b/services/apps/merge_suggestions_worker/src/types.ts index 6a2f292660..dfc588f11a 100644 --- a/services/apps/merge_suggestions_worker/src/types.ts +++ b/services/apps/merge_suggestions_worker/src/types.ts @@ -43,31 +43,6 @@ export type IOrganizationFilter = | IRangeFilterCreatedAt | IExistsFilter -export interface ILLMResult { - body: ILLMBody - prompt: string - responseTimeSeconds: number - // eslint-disable-next-line @typescript-eslint/no-explicit-any - modelSpecificArgs: any -} - -export interface ILLMBody { - id: string - type: string - role: string - model: string - content: { - type: string - text: string - }[] - stop_reason: string - stop_sequence: string - usage: { - input_tokens: number - output_tokens: number - } -} - export interface IProcessGenerateMemberMergeSuggestionsArgs { tenantId: string lastUuid?: string diff --git 
a/services/apps/merge_suggestions_worker/src/workflows/mergeMembersWithLLM.ts b/services/apps/merge_suggestions_worker/src/workflows/mergeMembersWithLLM.ts index 65890d85a5..21dc427687 100644 --- a/services/apps/merge_suggestions_worker/src/workflows/mergeMembersWithLLM.ts +++ b/services/apps/merge_suggestions_worker/src/workflows/mergeMembersWithLLM.ts @@ -1,10 +1,10 @@ import { continueAsNew, proxyActivities } from '@temporalio/workflow' -import { LLMSuggestionVerdictType } from '@crowd/types' +import { LlmQueryType } from '@crowd/types' import * as commonActivities from '../activities/common' import * as memberActivities from '../activities/memberMergeSuggestions' -import { ILLMResult, IProcessMergeMemberSuggestionsWithLLM } from '../types' +import { IProcessMergeMemberSuggestionsWithLLM } from '../types' import { removeEmailLikeIdentitiesFromMember } from '../utils' const memberActivitiesProxy = proxyActivities({ @@ -23,31 +23,6 @@ export async function mergeMembersWithLLM( args: IProcessMergeMemberSuggestionsWithLLM, ): Promise { const SUGGESTIONS_PER_RUN = 10 - const REGION = 'us-west-2' - const MODEL_ID = 'anthropic.claude-3-opus-20240229-v1:0' - const MODEL_ARGS = { - max_tokens: 2000, - anthropic_version: 'bedrock-2023-05-31', - temperature: 0, - } - const PROMPT = `Please compare and come up with a boolean answer if these two members are the same person or not. - Only compare data from first member and second member. Never compare data from only one member with itself. - Never tokenize 'platform' field using character tokenization. Use word tokenization for platform field in identities. - You should check all the sent fields between members to find similarities both literally and semantically. - Here are the fields written with respect to their importance and how to check. Identities >> Organizations > Attributes and other fields >> Display name - - 1. Identities: Tokenize value field (identity.value) using character tokenization. Exact match or identities with edit distance <= 2 suggests that members are similar. - Don't compare identities in a single member. Only compare identities between members. - 2. Organizations: Members are more likely to be the same when they have/had roles in similar organizations. - If there are no intersecting organizations it doesn't necessarily mean that they're different members. - 3. Attributes and other fields: If one member have a specific field and other member doesn't, skip that field when deciding similarity. - Checking semantically instead of literally is important for such fields. Important fields here are: location, timezone, languages, programming languages. - For example one member might have Berlin in location, while other can have Germany - consider such members have same location. - 4. Display Name: Tokenize using both character and word tokenization. When the display name is more than one word, and the difference is a few edit distances consider it a strong indication of similarity. - When one display name is contained by the other, check other fields for the final decision. The same members on different platforms might have different display names. - Display names can be multiple words and might be sorted in different order in different platforms for the same member. - Pro tip: If members have identities in the same platform (member1.identities[x].platform === member2.identities[y].platform) and if these identities have different usernames(member1.identities[x].value !== member2.identities[y].value) you can label them as different. 
- Only do such labeling if both members have identities in the same platform. If they don't have identities in the same platform ignore the pro tip. - Print 'true' if they are the same member, 'false' otherwise. No explanation required. Don't print anything else.` const suggestions = await memberActivitiesProxy.getRawMemberMergeSuggestions( args.similarity, @@ -66,27 +41,12 @@ export async function mergeMembersWithLLM( continue } - const llmResult: ILLMResult = await commonActivitiesProxy.getLLMResult( + const verdict = await commonActivitiesProxy.getLLMResult( + LlmQueryType.MEMBER_MERGE, members.map((member) => removeEmailLikeIdentitiesFromMember(member)), - MODEL_ID, - PROMPT, - REGION, - MODEL_ARGS, ) - await commonActivitiesProxy.saveLLMVerdict({ - type: LLMSuggestionVerdictType.MEMBER, - model: MODEL_ID, - primaryId: suggestion[0], - secondaryId: suggestion[1], - prompt: llmResult.prompt, - responseTimeSeconds: llmResult.responseTimeSeconds, - inputTokenCount: llmResult.body.usage.input_tokens, - outputTokenCount: llmResult.body.usage.output_tokens, - verdict: llmResult.body.content[0].text, - }) - - if (llmResult.body.content[0].text === 'true') { + if (verdict) { await commonActivitiesProxy.mergeMembers(suggestion[0], suggestion[1], args.tenantId) } } diff --git a/services/apps/merge_suggestions_worker/src/workflows/mergeOrganizationsWithLLM.ts b/services/apps/merge_suggestions_worker/src/workflows/mergeOrganizationsWithLLM.ts index 4b2e5bc63a..75e9c55ebc 100644 --- a/services/apps/merge_suggestions_worker/src/workflows/mergeOrganizationsWithLLM.ts +++ b/services/apps/merge_suggestions_worker/src/workflows/mergeOrganizationsWithLLM.ts @@ -1,10 +1,10 @@ import { continueAsNew, proxyActivities } from '@temporalio/workflow' -import { LLMSuggestionVerdictType } from '@crowd/types' +import { LlmQueryType } from '@crowd/types' import * as commonActivities from '../activities/common' import * as organizationActivities from '../activities/organizationMergeSuggestions' -import { ILLMResult, IProcessMergeOrganizationSuggestionsWithLLM } from '../types' +import { IProcessMergeOrganizationSuggestionsWithLLM } from '../types' const organizationActivitiesProxy = proxyActivities({ startToCloseTimeout: '1 minute', @@ -22,14 +22,6 @@ export async function mergeOrganizationsWithLLM( args: IProcessMergeOrganizationSuggestionsWithLLM, ): Promise { const SUGGESTIONS_PER_RUN = 5 - const REGION = 'us-west-2' - const MODEL_ID = 'anthropic.claude-3-opus-20240229-v1:0' - const MODEL_ARGS = { - max_tokens: 2000, - anthropic_version: 'bedrock-2023-05-31', - temperature: 0, - } - const PROMPT = `Please compare and come up with a boolean answer if these two organizations are the same organization or not. Print 'true' if they are the same organization, 'false' otherwise. No explanation required. 
Don't print anything else.` const suggestions = await organizationActivitiesProxy.getRawOrganizationMergeSuggestions( args.tenantId, @@ -54,27 +46,12 @@ export async function mergeOrganizationsWithLLM( continue } - const llmResult: ILLMResult = await commonActivitiesProxy.getLLMResult( + const verdict = await commonActivitiesProxy.getLLMResult( + LlmQueryType.ORGANIZATION_MERGE, organizations, - MODEL_ID, - PROMPT, - REGION, - MODEL_ARGS, ) - await commonActivitiesProxy.saveLLMVerdict({ - type: LLMSuggestionVerdictType.ORGANIZATION, - model: MODEL_ID, - primaryId: suggestion[0], - secondaryId: suggestion[1], - prompt: llmResult.prompt, - responseTimeSeconds: llmResult.responseTimeSeconds, - inputTokenCount: llmResult.body.usage.input_tokens, - outputTokenCount: llmResult.body.usage.output_tokens, - verdict: llmResult.body.content[0].text, - }) - - if (llmResult.body.content[0].text === 'true') { + if (verdict) { console.log( `LLM verdict says these two orgs are the same. Merging organizations: ${suggestion[0]} and ${suggestion[1]}!`, ) diff --git a/services/apps/merge_suggestions_worker/src/workflows/testMergingEntitiesWithLLM.ts b/services/apps/merge_suggestions_worker/src/workflows/testMergingEntitiesWithLLM.ts index 71156f46c1..68c315f01b 100644 --- a/services/apps/merge_suggestions_worker/src/workflows/testMergingEntitiesWithLLM.ts +++ b/services/apps/merge_suggestions_worker/src/workflows/testMergingEntitiesWithLLM.ts @@ -1,9 +1,11 @@ import { proxyActivities } from '@temporalio/workflow' +import { LlmQueryType } from '@crowd/types' + import * as commonActivities from '../activities/common' import * as memberActivities from '../activities/memberMergeSuggestions' import * as organizationActivities from '../activities/organizationMergeSuggestions' -import { ILLMResult, IProcessCheckSimilarityWithLLM } from '../types' +import { IProcessCheckSimilarityWithLLM } from '../types' import { removeEmailLikeIdentitiesFromMember } from '../utils' const memberActivitiesProxy = proxyActivities({ @@ -39,16 +41,13 @@ export async function testMergingEntitiesWithLLM( continue } - const res: ILLMResult = await commonActivitiesProxy.getLLMResult( + const res = await commonActivitiesProxy.getLLMResult( + LlmQueryType.MEMBER_MERGE, members.map((member) => removeEmailLikeIdentitiesFromMember(member)), - args.modelId, - args.prompt, - args.region, - args.modelSpecificArgs, ) console.log(`Raw res: `) - console.log(res.body) - totalInputTokenCount += res.body.usage.input_tokens + console.log(res.answer) + totalInputTokenCount += res.inputTokenCount promptCount += 1 } } @@ -69,15 +68,12 @@ export async function testMergingEntitiesWithLLM( } const res = await commonActivitiesProxy.getLLMResult( + LlmQueryType.ORGANIZATION_MERGE, organizations, - args.modelId, - args.prompt, - args.region, - args.modelSpecificArgs, ) console.log(`Raw res: `) - console.log(res.body) - totalInputTokenCount += res.body.usage.input_tokens + console.log(res.answer) + totalInputTokenCount += res.inputTokenCount promptCount += 1 } } diff --git a/services/apps/premium/members_enrichment_worker/package.json b/services/apps/premium/members_enrichment_worker/package.json index 0c5acbdc5c..2bd7c8c344 100644 --- a/services/apps/premium/members_enrichment_worker/package.json +++ b/services/apps/premium/members_enrichment_worker/package.json @@ -6,6 +6,7 @@ "start:debug": "CROWD_TEMPORAL_TASKQUEUE=members-enrichment SERVICE=members-enrichment-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", "dev:local": "nodemon --watch src 
--watch ../../../libs --ext ts --exec pnpm run start:debug:local", "dev": "nodemon --watch src --watch ../../../libs --ext ts --exec pnpm run start:debug", + "script:onboarding": "SERVICE=script tsx --inspect src/bin/onboarding.ts", "lint": "npx eslint --ext .ts src --max-warnings=0", "format": "npx prettier --write \"src/**/*.ts\"", "format-check": "npx prettier --check .", @@ -16,6 +17,7 @@ "@crowd/archetype-worker": "workspace:*", "@crowd/common": "workspace:*", "@crowd/data-access-layer": "workspace:*", + "@crowd/common_services": "workspace:*", "@crowd/feature-flags": "workspace:*", "@crowd/integrations": "workspace:*", "@crowd/opensearch": "workspace:*", diff --git a/services/apps/premium/members_enrichment_worker/src/activities.ts b/services/apps/premium/members_enrichment_worker/src/activities.ts index 98d7c43e9d..94275acb52 100644 --- a/services/apps/premium/members_enrichment_worker/src/activities.ts +++ b/services/apps/premium/members_enrichment_worker/src/activities.ts @@ -6,6 +6,7 @@ import { isCacheObsolete, isEnrichableBySource, normalizeEnrichmentData, + processMemberSources, touchMemberEnrichmentCacheUpdatedAt, updateMemberEnrichmentCache, } from './activities/enrichment' @@ -30,6 +31,7 @@ import { } from './activities/syncEnrichedData' export { + processMemberSources, getEnrichableMembers, getEnrichmentData, normalizeEnrichmentData, diff --git a/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts b/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts index 2700e63e27..32f5d18ddf 100644 --- a/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts +++ b/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts @@ -1,5 +1,7 @@ +import { LlmService } from '@crowd/common_services' import { findMemberIdentityWithTheMostActivityInPlatform as findMemberIdentityWithTheMostActivityInPlatformQuestDb } from '@crowd/data-access-layer/src/activities' import { + fetchMemberDataForLLMSquashing, findMemberEnrichmentCacheDb, findMemberEnrichmentCacheForAllSourcesDb, insertMemberEnrichmentCacheDb, @@ -14,7 +16,7 @@ import { } from '@crowd/types' import { EnrichmentSourceServiceFactory } from '../factory' -import { svc } from '../main' +import { svc } from '../service' import { IEnrichmentSourceInput, IMemberEnrichmentData, @@ -95,10 +97,10 @@ export async function hasRemainingCredits(source: MemberEnrichmentSource): Promi } export async function findMemberEnrichmentCache( - source: MemberEnrichmentSource, + sources: MemberEnrichmentSource[], memberId: string, -): Promise> { - return findMemberEnrichmentCacheDb(svc.postgres.reader.connection(), memberId, source) +): Promise[]> { + return findMemberEnrichmentCacheDb(svc.postgres.reader.connection(), memberId, sources) } export async function findMemberEnrichmentCacheForAllSources( @@ -136,3 +138,127 @@ export async function findMemberIdentityWithTheMostActivityInPlatform( ): Promise { return findMemberIdentityWithTheMostActivityInPlatformQuestDb(svc.questdbSQL, memberId, platform) } + +export async function processMemberSources( + memberId: string, + sources: MemberEnrichmentSource[], +): Promise { + svc.log.debug({ memberId }, 'Processing member sources!') + + const toBeSquashed = {} + // const toBeSquashedContributions = {} + // find if there's already saved enrichment data in source + const caches = await findMemberEnrichmentCache(sources, memberId) + for (const source of sources) { + const cache = caches.find((c) => c.source === source) + if (cache 
&& cache.data) { + const normalized = (await normalizeEnrichmentData( + source, + cache.data, + )) as IMemberEnrichmentDataNormalized + + // TODO uros temp remove contributions from sources to mitigate context size + // if (Array.isArray(normalized)) { + // const normalizedContributions = [] + // for (const n of normalized) { + // if (n.contributions) { + // normalizedContributions.push(n.contributions) + // delete n.contributions + // } + // } + + // toBeSquashedContributions[source] = normalizedContributions + // } else if (normalized.contributions) { + // toBeSquashedContributions[source] = normalized.contributions + // delete normalized.contributions + // } + + toBeSquashed[source] = normalized + } + } + + if (Object.keys(toBeSquashed).length > 1) { + const existingMemberData = await fetchMemberDataForLLMSquashing(svc.postgres.reader, memberId) + svc.log.info({ memberId }, 'Squashing data for member using LLM!') + + // TODO uros Implement data squasher using LLM & actual member entity enrichment logic + + const llmService = new LlmService( + svc.postgres.writer, + { + accessKeyId: process.env['CROWD_AWS_BEDROCK_ACCESS_KEY_ID'], + secretAccessKey: process.env['CROWD_AWS_BEDROCK_SECRET_ACCESS_KEY'], + }, + svc.log, + ) + + const prompt = ` + You are a data consolidation expert specializing in professional profile data. + Your task is to analyze and merge member data from an existing database with enriched data from multiple sources. + + EXISTING VERIFIED MEMBER DATA: + ${JSON.stringify(existingMemberData)} + + ENRICHED DATA FROM MULTIPLE SOURCES: + ${JSON.stringify(toBeSquashed)} + + Your task is to: + + 1. IDENTITY VERIFICATION + - Analyze all provided LinkedIn profiles across sources + - Mark LinkedIn identities as verified if: + * They match an existing verified LinkedIn identity, OR + * The same LinkedIn profile appears in 2+ independent sources + - Mark LinkedIn identities as unverified if: + * They appear in only one source, OR + * Different LinkedIn profiles are found for the same person + + 2. DATA CONFIDENCE ASSESSMENT + For each piece of enriched data, determine confidence level based on: + - Match with existing verified data + - Consistency across multiple sources + - Source reliability (verified identity source > unverified source) + - Supporting evidence from other identities (email, username patterns) + + 3. DATA CONSOLIDATION + Provide a consolidated profile with: + - displayName + - attributes (with sources) + - identities (with verification status) + - organizations (with sources) + + RULES: + 1. Prefer data from verified sources over unverified ones + 2. When conflicts exist, prefer data corroborated by multiple sources + 3. For organization histories, preserve all distinct positions with their sources + 4. Maintain provenance for each data point in attributes + 5. Flag any suspicious patterns that might indicate wrong person data + 6. For conflicting data points, include both with confidence indicators + 7. When merging organization data, verify organization identity matches across sources + + Please analyze the provided data and respond with your consolidated results. + + Format your response as a JSON object matching this structure: + { + "confidenceScore": number (0-1), + "consolidatedData": { + // Match EXISTING VERIFIED MEMBER DATA structure + }, + "reasoning": { + "identityVerification": string[], + "confidenceFactors": string[], + "conflicts": string[], + "recommendations": string[] + } + } + Answer with JSON only and nothing else. 
+ ` + + const result = await llmService.consolidateMemberEnrichmentData(memberId, prompt) + svc.log.info({ memberId, result }, 'LLM result') + } else { + svc.log.debug({ memberId }, 'No data to squash for member!') + } + + return false +} diff --git a/services/apps/premium/members_enrichment_worker/src/activities/getMembers.ts b/services/apps/premium/members_enrichment_worker/src/activities/getMembers.ts index 09698a6512..efd42bd382 100644 --- a/services/apps/premium/members_enrichment_worker/src/activities/getMembers.ts +++ b/services/apps/premium/members_enrichment_worker/src/activities/getMembers.ts @@ -6,7 +6,7 @@ import { } from '@crowd/types' import { EnrichmentSourceServiceFactory } from '../factory' -import { svc } from '../main' +import { svc } from '../service' export async function getEnrichableMembers( limit: number, diff --git a/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/authenticateLFAuth0.ts b/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/authenticateLFAuth0.ts index 3e45379e71..32f902e7c4 100644 --- a/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/authenticateLFAuth0.ts +++ b/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/authenticateLFAuth0.ts @@ -2,7 +2,7 @@ import { AuthenticationClient } from 'auth0' import { RedisCache } from '@crowd/redis' -import { svc } from '../../main' +import { svc } from '../../service' import { ITokenWithExpiration } from '../../sources/lfid/types' export async function refreshToken(): Promise { diff --git a/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/enrichLFAuth0.ts b/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/enrichLFAuth0.ts index df3b3b4bf9..13c52d421d 100644 --- a/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/enrichLFAuth0.ts +++ b/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/enrichLFAuth0.ts @@ -5,7 +5,7 @@ import { import { insertMemberIdentity } from '@crowd/data-access-layer/src/old/apps/premium/members_enrichment_worker/normalize' import { IAttributes, IMemberIdentity } from '@crowd/types' -import { svc } from '../../main' +import { svc } from '../../service' export async function getIdentitiesExistInOtherMembers( tenantId: string, diff --git a/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/getEnrichmentLFAuth0.ts b/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/getEnrichmentLFAuth0.ts index 43814e9be4..40442e6f2e 100644 --- a/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/getEnrichmentLFAuth0.ts +++ b/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/getEnrichmentLFAuth0.ts @@ -4,7 +4,7 @@ import { randomUUID } from 'crypto' import { RedisCache, acquireLock, releaseLock } from '@crowd/redis' import { IMember, MemberIdentityType, PlatformType } from '@crowd/types' -import { svc } from '../../main' +import { svc } from '../../service' import { IGetEnrichmentDataResponse } from '../../sources/lfid/types' // We'll keep the remaining rate limits in redisCache(lfx-auth0) diff --git a/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/getLFIDEnrichableMembers.ts b/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/getLFIDEnrichableMembers.ts index 61bb4d2620..70807f6d37 100644 --- a/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/getLFIDEnrichableMembers.ts +++ 
b/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/getLFIDEnrichableMembers.ts @@ -1,7 +1,7 @@ import { fetchMembersForLFIDEnrichment } from '@crowd/data-access-layer/src/old/apps/premium/members_enrichment_worker' import { IMember } from '@crowd/types' -import { svc } from '../../main' +import { svc } from '../../service' export async function getLFIDEnrichableMembers(limit: number, afterId: string): Promise { let rows: IMember[] = [] diff --git a/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/githubIdentities.ts b/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/githubIdentities.ts index b5f58ebf28..bcfdfe544e 100644 --- a/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/githubIdentities.ts +++ b/services/apps/premium/members_enrichment_worker/src/activities/lf-auth0/githubIdentities.ts @@ -6,7 +6,7 @@ import { GithubAPIResource, GithubTokenRotator } from '@crowd/integrations' import { RedisCache } from '@crowd/redis' import { IMemberIdentity } from '@crowd/types' -import { svc } from '../../main' +import { svc } from '../../service' import { IGithubUser } from '../../sources/lfid/types' export async function getGithubIdentitiesWithoutSourceId( diff --git a/services/apps/premium/members_enrichment_worker/src/activities/syncEnrichedData.ts b/services/apps/premium/members_enrichment_worker/src/activities/syncEnrichedData.ts index 6315799cab..b51028a993 100644 --- a/services/apps/premium/members_enrichment_worker/src/activities/syncEnrichedData.ts +++ b/services/apps/premium/members_enrichment_worker/src/activities/syncEnrichedData.ts @@ -1,7 +1,7 @@ import { DbStore } from '@crowd/data-access-layer/src/database' import { MemberSyncService, OrganizationSyncService } from '@crowd/opensearch' -import { svc } from '../main' +import { svc } from '../service' const syncMembers = new MemberSyncService( svc.redis, diff --git a/services/apps/premium/members_enrichment_worker/src/main.ts b/services/apps/premium/members_enrichment_worker/src/main.ts index 7603252158..70094b1d06 100644 --- a/services/apps/premium/members_enrichment_worker/src/main.ts +++ b/services/apps/premium/members_enrichment_worker/src/main.ts @@ -1,44 +1,7 @@ -import { Config } from '@crowd/archetype-standard' -import { Options, ServiceWorker } from '@crowd/archetype-worker' import { Edition } from '@crowd/types' import { scheduleMembersEnrichment, scheduleMembersLFIDEnrichment } from './schedules' - -const config: Config = { - envvars: [ - 'CROWD_ENRICHMENT_PROGAI_URL', - 'CROWD_ENRICHMENT_PROGAI_API_KEY', - 'CROWD_ENRICHMENT_CLEARBIT_URL', - 'CROWD_ENRICHMENT_CLEARBIT_API_KEY', - 'CROWD_ENRICHMENT_SERP_API_URL', - 'CROWD_ENRICHMENT_SERP_API_KEY', - 'CROWD_ENRICHMENT_CRUSTDATA_URL', - 'CROWD_ENRICHMENT_CRUSTDATA_API_KEY', - ], - producer: { - enabled: false, - }, - temporal: { - enabled: true, - }, - questdb: { - enabled: true, - }, - redis: { - enabled: true, - }, -} - -const options: Options = { - postgres: { - enabled: true, - }, - opensearch: { - enabled: true, - }, -} - -export const svc = new ServiceWorker(config, options) +import { svc } from './service' setImmediate(async () => { await svc.init() diff --git a/services/apps/premium/members_enrichment_worker/src/schedules/getMembersToEnrich.ts b/services/apps/premium/members_enrichment_worker/src/schedules/getMembersToEnrich.ts index e318fce93a..ff379e184b 100644 --- a/services/apps/premium/members_enrichment_worker/src/schedules/getMembersToEnrich.ts +++ 
b/services/apps/premium/members_enrichment_worker/src/schedules/getMembersToEnrich.ts @@ -2,7 +2,7 @@ import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/clien import { IS_DEV_ENV, IS_TEST_ENV } from '@crowd/common' -import { svc } from '../main' +import { svc } from '../service' import { getMembersForLFIDEnrichment, getMembersToEnrich } from '../workflows' export const scheduleMembersEnrichment = async () => { diff --git a/services/apps/premium/members_enrichment_worker/src/service.ts b/services/apps/premium/members_enrichment_worker/src/service.ts new file mode 100644 index 0000000000..0b93411645 --- /dev/null +++ b/services/apps/premium/members_enrichment_worker/src/service.ts @@ -0,0 +1,38 @@ +import { Config } from '@crowd/archetype-standard' +import { Options, ServiceWorker } from '@crowd/archetype-worker' + +const config: Config = { + envvars: [ + 'CROWD_ENRICHMENT_PROGAI_URL', + 'CROWD_ENRICHMENT_PROGAI_API_KEY', + 'CROWD_ENRICHMENT_CLEARBIT_URL', + 'CROWD_ENRICHMENT_CLEARBIT_API_KEY', + 'CROWD_ENRICHMENT_SERP_API_URL', + 'CROWD_ENRICHMENT_SERP_API_KEY', + 'CROWD_ENRICHMENT_CRUSTDATA_URL', + 'CROWD_ENRICHMENT_CRUSTDATA_API_KEY', + ], + producer: { + enabled: false, + }, + temporal: { + enabled: true, + }, + questdb: { + enabled: true, + }, + redis: { + enabled: true, + }, +} + +const options: Options = { + postgres: { + enabled: true, + }, + opensearch: { + enabled: true, + }, +} + +export const svc = new ServiceWorker(config, options) diff --git a/services/apps/premium/members_enrichment_worker/src/sources/crustdata/service.ts b/services/apps/premium/members_enrichment_worker/src/sources/crustdata/service.ts index 6e2e20808e..dd71b19140 100644 --- a/services/apps/premium/members_enrichment_worker/src/sources/crustdata/service.ts +++ b/services/apps/premium/members_enrichment_worker/src/sources/crustdata/service.ts @@ -279,7 +279,14 @@ export default class EnrichmentServiceCrustdata extends LoggerBase implements IE } if (data.email) { - for (const email of data.email.split(',').filter(isEmail)) { + let emails: string[] + if (Array.isArray(data.email)) { + emails = data.email + } else { + emails = data.email.split(',').filter(isEmail) + } + + for (const email of emails) { normalized.identities.push({ type: MemberIdentityType.EMAIL, platform: this.platform, diff --git a/services/apps/premium/members_enrichment_worker/src/workflows/enrichMember.ts b/services/apps/premium/members_enrichment_worker/src/workflows/enrichMember.ts index 5295043c7c..0f35355edd 100644 --- a/services/apps/premium/members_enrichment_worker/src/workflows/enrichMember.ts +++ b/services/apps/premium/members_enrichment_worker/src/workflows/enrichMember.ts @@ -18,8 +18,8 @@ const { touchMemberEnrichmentCacheUpdatedAt, updateMemberEnrichmentCache, isCacheObsolete, - normalizeEnrichmentData, findMemberIdentityWithTheMostActivityInPlatform, + processMemberSources, } = proxyActivities({ startToCloseTimeout: '20 seconds', retry: { @@ -38,7 +38,8 @@ export async function enrichMember( for (const source of sources) { // find if there's already saved enrichment data in source - const cache = await findMemberEnrichmentCache(source, input.id) + const caches = await findMemberEnrichmentCache([source], input.id) + const cache = caches.find((c) => c.source === source) // cache is obsolete when it's not found or cache.updatedAt is older than cacheObsoleteAfterSeconds if (await isCacheObsolete(source, cache)) { @@ -100,16 +101,6 @@ export async function enrichMember( if (changeInEnrichmentSourceData) { // 
Member enrichment data has been updated, use squasher again! - const toBeSquashed = {} - for (const source of sources) { - // find if there's already saved enrichment data in source - const cache = await findMemberEnrichmentCache(source, input.id) - if (cache && cache.data) { - const normalized = await normalizeEnrichmentData(source, cache.data) - toBeSquashed[source] = normalized - } - } - - // TODO:: Implement data squasher using LLM & actual member entity enrichment logic + await processMemberSources(input.id, sources) } } diff --git a/services/archetypes/worker/src/index.ts b/services/archetypes/worker/src/index.ts index a6c4fc8d85..6b647be16c 100644 --- a/services/archetypes/worker/src/index.ts +++ b/services/archetypes/worker/src/index.ts @@ -100,7 +100,7 @@ export class ServiceWorker extends Service { // We first need to ensure a standard service can be initialized given the config // and environment variables. - override async init() { + override async init(initWorker = true) { try { await super.init() } catch (err) { @@ -203,51 +203,53 @@ export class ServiceWorker extends Service { } } - try { - const certificate = process.env['CROWD_TEMPORAL_CERTIFICATE'] - const privateKey = process.env['CROWD_TEMPORAL_PRIVATE_KEY'] - - this.log.info( - { + if (initWorker) { + try { + const certificate = process.env['CROWD_TEMPORAL_CERTIFICATE'] + const privateKey = process.env['CROWD_TEMPORAL_PRIVATE_KEY'] + + this.log.info( + { + address: process.env['CROWD_TEMPORAL_SERVER_URL'], + certificate: certificate ? 'yes' : 'no', + privateKey: privateKey ? 'yes' : 'no', + }, + 'Connecting to Temporal server as a worker!', + ) + + const connection = await NativeConnection.connect({ address: process.env['CROWD_TEMPORAL_SERVER_URL'], - certificate: certificate ? 'yes' : 'no', - privateKey: privateKey ? 'yes' : 'no', - }, - 'Connecting to Temporal server as a worker!', - ) - - const connection = await NativeConnection.connect({ - address: process.env['CROWD_TEMPORAL_SERVER_URL'], - tls: - certificate && privateKey - ? { - clientCertPair: { - crt: Buffer.from(certificate, 'base64'), - key: Buffer.from(privateKey, 'base64'), - }, - } - : undefined, - }) + tls: + certificate && privateKey + ? 
{ + clientCertPair: { + crt: Buffer.from(certificate, 'base64'), + key: Buffer.from(privateKey, 'base64'), + }, + } + : undefined, + }) - const workflowBundle = await bundleWorkflowCode({ - workflowsPath: path.resolve('./src/workflows'), - }) + const workflowBundle = await bundleWorkflowCode({ + workflowsPath: path.resolve('./src/workflows'), + }) - this._worker = await TemporalWorker.create({ - connection: connection, - identity: this.name, - namespace: process.env['CROWD_TEMPORAL_NAMESPACE'], - taskQueue: process.env['CROWD_TEMPORAL_TASKQUEUE'], - enableSDKTracing: true, - showStackTraceSources: true, - workflowBundle: workflowBundle, - activities: require(path.resolve('./src/activities')), - dataConverter: await getDataConverter(), - maxTaskQueueActivitiesPerSecond: this.options.maxTaskQueueActivitiesPerSecond, - maxConcurrentActivityTaskExecutions: this.options.maxConcurrentActivityTaskExecutions, - }) - } catch (err) { - throw new Error(err) + this._worker = await TemporalWorker.create({ + connection: connection, + identity: this.name, + namespace: process.env['CROWD_TEMPORAL_NAMESPACE'], + taskQueue: process.env['CROWD_TEMPORAL_TASKQUEUE'], + enableSDKTracing: true, + showStackTraceSources: true, + workflowBundle: workflowBundle, + activities: require(path.resolve('./src/activities')), + dataConverter: await getDataConverter(), + maxTaskQueueActivitiesPerSecond: this.options.maxTaskQueueActivitiesPerSecond, + maxConcurrentActivityTaskExecutions: this.options.maxConcurrentActivityTaskExecutions, + }) + } catch (err) { + throw new Error(err) + } } } diff --git a/services/libs/common_services/package.json b/services/libs/common_services/package.json index c6fa08a010..3585784a7f 100644 --- a/services/libs/common_services/package.json +++ b/services/libs/common_services/package.json @@ -12,6 +12,7 @@ "typescript": "^5.6.3" }, "dependencies": { + "@aws-sdk/client-bedrock-runtime": "^3.572.0", "@crowd/common": "workspace:*", "@crowd/database": "workspace:*", "@crowd/feature-flags": "workspace:*", diff --git a/services/libs/common_services/src/repos/llmPromptHistory.repo.ts b/services/libs/common_services/src/repos/llmPromptHistory.repo.ts new file mode 100644 index 0000000000..8f7a854713 --- /dev/null +++ b/services/libs/common_services/src/repos/llmPromptHistory.repo.ts @@ -0,0 +1,31 @@ +import { DbStore, RepositoryBase } from '@crowd/database' +import { Logger } from '@crowd/logging' +import { ILlmResponse, LlmModelType, LlmQueryType } from '@crowd/types' + +export class LlmPromptHistoryRepository extends RepositoryBase { + public constructor(dbStore: DbStore, parentLog: Logger) { + super(dbStore, parentLog) + } + + public async insertPromptHistoryEntry( + type: LlmQueryType, + model: LlmModelType, + result: ILlmResponse, + entityId?: string, + metadata?: Record, + ): Promise { + await this.db().none( + ` + insert into "llmPromptHistory"(type, model, "entityId", metadata, prompt, answer, "inputTokenCount", "outputTokenCount", "responseTimeSeconds") + values($(type), $(model), $(entityId), $(metadata), $(prompt), $(answer), $(inputTokenCount), $(outputTokenCount), $(responseTimeSeconds)); + `, + { + type, + model, + entityId, + metadata: metadata ? 
JSON.stringify(metadata) : null, + ...result, + }, + ) + } +} diff --git a/services/libs/common_services/src/services/index.ts b/services/libs/common_services/src/services/index.ts index 8967b647ab..1d23c2fe49 100644 --- a/services/libs/common_services/src/services/index.ts +++ b/services/libs/common_services/src/services/index.ts @@ -1,2 +1,3 @@ export * from './priority.service' +export * from './llm.service' export * from './emitters' diff --git a/services/libs/common_services/src/services/llm.service.ts b/services/libs/common_services/src/services/llm.service.ts new file mode 100644 index 0000000000..9599c83797 --- /dev/null +++ b/services/libs/common_services/src/services/llm.service.ts @@ -0,0 +1,211 @@ +import { + BedrockRuntimeClient, + InvokeModelCommand, + InvokeModelCommandOutput, +} from '@aws-sdk/client-bedrock-runtime' +import { performance } from 'perf_hooks' + +import { DbStore } from '@crowd/database' +import { Logger, LoggerBase } from '@crowd/logging' +import { + ILLMConsumableMember, + ILLMConsumableOrganization, + ILlmResponse, + ILlmResult, + ILlmSettings, + LLM_MODEL_PRICING_MAP, + LLM_MODEL_REGION_MAP, + LLM_SETTINGS, + LlmQueryType, +} from '@crowd/types' + +import { LlmPromptHistoryRepository } from '../repos/llmPromptHistory.repo' + +export interface IBedrockClientCredentials { + accessKeyId: string + secretAccessKey: string +} + +export class LlmService extends LoggerBase { + private readonly repo: LlmPromptHistoryRepository + private readonly clientRegionMap: Map + + public constructor( + store: DbStore, + private readonly bedrockCredentials: IBedrockClientCredentials, + parentLog: Logger, + ) { + super(parentLog) + + this.repo = new LlmPromptHistoryRepository(store, this.log) + this.clientRegionMap = new Map() + } + + private client(settings: ILlmSettings): BedrockRuntimeClient { + const region = LLM_MODEL_REGION_MAP[settings.modelId] + + let client: BedrockRuntimeClient + if (this.clientRegionMap.has(region)) { + client = this.clientRegionMap.get(region) + } else { + client = new BedrockRuntimeClient({ + credentials: { + accessKeyId: this.bedrockCredentials.accessKeyId, + secretAccessKey: this.bedrockCredentials.secretAccessKey, + }, + region, + }) + this.clientRegionMap.set(region, client) + } + + return client + } + + public async queryLlm( + type: LlmQueryType, + prompt: string, + entityId?: string, + metadata?: Record, + saveHistory = true, + ): Promise { + const settings = LLM_SETTINGS[type] + if (!settings) { + throw new Error(`No settings found for LLM query type: ${type}`) + } + + const client = this.client(settings) + + const start = performance.now() + const end = () => { + const end = performance.now() + const duration = end - start + return Math.ceil(duration / 1000) + } + + const command = new InvokeModelCommand({ + body: JSON.stringify({ + messages: [ + { + role: 'user', + content: [ + { + type: 'text', + text: prompt, + }, + ], + }, + ], + // eslint-disable-next-line @typescript-eslint/no-explicit-any + ...(settings.arguments as any), + }), + modelId: settings.modelId, + accept: 'application/json', + contentType: 'application/json', + }) + + let res: InvokeModelCommandOutput + try { + res = await client.send(command) + } catch (err) { + this.log.error(err, { settings, prompt }, 'Failed to query LLM!') + throw err + } + + const body = JSON.parse(res.body.transformToString()) + const responseTimeSeconds = end() + + const inputTokenCount = body.usage.input_tokens + const outputTokenCount = body.usage.output_tokens + const answer = 
body.content[0].text + + const pricing = LLM_MODEL_PRICING_MAP[settings.modelId] + + const inputCost = (inputTokenCount / 1000) * pricing.costPer1000InputTokens + const outputCost = (outputTokenCount / 1000) * pricing.costPer1000OutputTokens + const totalCost = inputCost + outputCost + + this.log.info({ type, entityId, inputCost, outputCost, totalCost }, 'Estimated LLM cost!') + + const result = { + prompt, + answer, + inputTokenCount, + outputTokenCount, + responseTimeSeconds, + model: settings.modelId, + } + + if (saveHistory) { + try { + await this.repo.insertPromptHistoryEntry(type, settings.modelId, result, entityId, metadata) + } catch (err) { + this.log.error(err, 'Failed to save LLM prompt history entry!') + throw err + } + } + + return result + } + + public async consolidateMemberEnrichmentData( + memberId: string, + prompt: string, + ): Promise> { + const response = await this.queryLlm(LlmQueryType.MEMBER_ENRICHMENT, prompt, memberId) + + const result = response.answer + + return { + result, + ...response, + } + } + + public async mergeSuggestion( + type: LlmQueryType.MEMBER_MERGE | LlmQueryType.ORGANIZATION_MERGE, + suggestion: ILLMConsumableMember[] | ILLMConsumableOrganization[], + ): Promise> { + if (suggestion.length !== 2) { + console.log(suggestion) + throw new Error('Exactly 2 entities are required for LLM comparison') + } + + const prompt = type === LlmQueryType.MEMBER_MERGE ? MEMBER_PROMPT : ORGANIZATION_PROMPT + + const fullPrompt = `Your task is to analyze the following two json documents. ${JSON.stringify( + suggestion, + )} . ${prompt}` + + const response = await this.queryLlm(type, fullPrompt, suggestion[0].id, { + secondaryId: suggestion[1].id, + }) + + const result = response.answer === 'true' + + return { + result, + ...response, + } + } +} + +const MEMBER_PROMPT = `Please compare and come up with a boolean answer if these two members are the same person or not. + Only compare data from first member and second member. Never compare data from only one member with itself. + Never tokenize 'platform' field using character tokenization. Use word tokenization for platform field in identities. + You should check all the sent fields between members to find similarities both literally and semantically. + Here are the fields written with respect to their importance and how to check. Identities >> Organizations > Attributes and other fields >> Display name - + 1. Identities: Tokenize value field (identity.value) using character tokenization. Exact match or identities with edit distance <= 2 suggests that members are similar. + Don't compare identities in a single member. Only compare identities between members. + 2. Organizations: Members are more likely to be the same when they have/had roles in similar organizations. + If there are no intersecting organizations it doesn't necessarily mean that they're different members. + 3. Attributes and other fields: If one member have a specific field and other member doesn't, skip that field when deciding similarity. + Checking semantically instead of literally is important for such fields. Important fields here are: location, timezone, languages, programming languages. + For example one member might have Berlin in location, while other can have Germany - consider such members have same location. + 4. Display Name: Tokenize using both character and word tokenization. When the display name is more than one word, and the difference is a few edit distances consider it a strong indication of similarity. 
+ When one display name is contained by the other, check other fields for the final decision. The same members on different platforms might have different display names. + Display names can be multiple words and might be sorted in different order in different platforms for the same member. + Pro tip: If members have identities in the same platform (member1.identities[x].platform === member2.identities[y].platform) and if these identities have different usernames(member1.identities[x].value !== member2.identities[y].value) you can label them as different. + Only do such labeling if both members have identities in the same platform. If they don't have identities in the same platform ignore the pro tip. + Print 'true' if they are the same member, 'false' otherwise. No explanation required. Don't print anything else.` + +const ORGANIZATION_PROMPT = `Please compare and come up with a boolean answer if these two organizations are the same organization or not. Print 'true' if they are the same organization, 'false' otherwise. No explanation required. Don't print anything else.` diff --git a/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/llmSuggestionVerdicts.repo.ts b/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/llmSuggestionVerdicts.repo.ts deleted file mode 100644 index a1ac25a3a2..0000000000 --- a/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/llmSuggestionVerdicts.repo.ts +++ /dev/null @@ -1,43 +0,0 @@ -import { randomUUID } from 'crypto' - -import { DbConnection, DbTransaction } from '@crowd/database' -import { Logger } from '@crowd/logging' -import { ILLMSuggestionVerdict } from '@crowd/types' - -class LLMSuggestionVerdictsRepository { - constructor( - private readonly connection: DbConnection | DbTransaction, - private readonly log: Logger, - ) {} - - async saveLLMVerdict(verdict: ILLMSuggestionVerdict): Promise { - const query = ` - insert into "llmSuggestionVerdicts" ("id", "type", "model", "primaryId", "secondaryId", "prompt", "verdict", "inputTokenCount", "outputTokenCount", "responseTimeSeconds", "createdAt") - values ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, now()) - returning "id"; - ` - let result: { id: string } - - try { - result = await this.connection.one(query, [ - randomUUID(), - verdict.type, - verdict.model, - verdict.primaryId, - verdict.secondaryId, - verdict.prompt, - verdict.verdict, - verdict.inputTokenCount, - verdict.outputTokenCount, - verdict.responseTimeSeconds, - ]) - } catch (err) { - this.log.error(err) - throw new Error(err) - } - - return result.id - } -} - -export default LLMSuggestionVerdictsRepository diff --git a/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/memberMergeSuggestions.repo.ts b/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/memberMergeSuggestions.repo.ts index fa7976f74f..89c9d7a403 100644 --- a/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/memberMergeSuggestions.repo.ts +++ b/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/memberMergeSuggestions.repo.ts @@ -3,7 +3,7 @@ import { Logger } from '@crowd/logging' import { ILLMConsumableMemberDbResult, IMemberMergeSuggestion, - LLMSuggestionVerdictType, + LlmQueryType, MemberMergeSuggestionTable, SuggestionType, } from '@crowd/types' @@ -212,6 +212,7 @@ class MemberMergeSuggestionsRepository { const result: ILLMConsumableMemberDbResult[] = await this.connection.manyOrNone( ` select + mem.id, mem.attributes, mem."displayName", 
mem."joinedAt", @@ -258,17 +259,17 @@ class MemberMergeSuggestionsRepository { const query = `select * from "memberToMergeRaw" mtmr where not exists ( - select 1 from "llmSuggestionVerdicts" lsv + select 1 from "llmPromptHistory" lsv where ( - lsv."primaryId" = mtmr."memberId" and - lsv."secondaryId" = mtmr."toMergeId" and - lsv.type = '${LLMSuggestionVerdictType.MEMBER}' + lsv."entityId" = mtmr."memberId" and + (lsv.metadata ->> 'secondaryId')::uuid = mtmr."toMergeId" and + lsv.type = '${LlmQueryType.MEMBER_MERGE}' ) or ( - lsv."primaryId" = mtmr."toMergeId" and - lsv."secondaryId" = mtmr."memberId" and - lsv.type = '${LLMSuggestionVerdictType.MEMBER}' + lsv."entityId" = mtmr."toMergeId" and + (lsv.metadata ->> 'secondaryId')::uuid = mtmr."memberId" and + lsv.type = '${LlmQueryType.MEMBER_MERGE}' ) ) ${similarityLTEFilter} diff --git a/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/organizationMergeSuggestions.repo.ts b/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/organizationMergeSuggestions.repo.ts index 310f210470..f60593cf3c 100644 --- a/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/organizationMergeSuggestions.repo.ts +++ b/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/organizationMergeSuggestions.repo.ts @@ -2,7 +2,7 @@ import { DbConnection, DbTransaction } from '@crowd/database' import { Logger } from '@crowd/logging' import { IOrganizationMergeSuggestion, - LLMSuggestionVerdictType, + LlmQueryType, OrganizationMergeSuggestionTable, SuggestionType, } from '@crowd/types' @@ -205,7 +205,7 @@ class OrganizationMergeSuggestionsRepository { /** * We get raw (unfiltered) suggestions from the database. * When onlyLFXMembers is true it only returns suggestions for lfx member organizations. - * All returned suggestions are checked against the "llmSuggestionVerdicts" table to see if they have already been processed. + * All returned suggestions are checked against the "llmPromptHistory" table to see if they have already been processed. * Already processed suggestions will not be returned. 
* @param similarityFilter * @param limit @@ -254,17 +254,17 @@ class OrganizationMergeSuggestionsRepository { select distinct s."organizationId", s."toMergeId" from suggestions s where not exists ( - select 1 from "llmSuggestionVerdicts" lsv + select 1 from "llmPromptHistory" lsv where ( - lsv."primaryId" = s."organizationId" and - lsv."secondaryId" = s."toMergeId" and - lsv.type = '${LLMSuggestionVerdictType.ORGANIZATION}' + lsv."entityId" = s."organizationId" and + (lsv.metadata ->> 'secondaryId')::uuid = s."toMergeId" and + lsv.type = '${LlmQueryType.ORGANIZATION_MERGE}' ) or ( - lsv."primaryId" = s."toMergeId" and - lsv."secondaryId" = s."organizationId" and - lsv.type = '${LLMSuggestionVerdictType.ORGANIZATION}' + lsv."entityId" = s."toMergeId" and + (lsv.metadata ->> 'secondaryId')::uuid = s."organizationId" and + lsv.type = '${LlmQueryType.ORGANIZATION_MERGE}' ) ) @@ -274,17 +274,17 @@ class OrganizationMergeSuggestionsRepository { query = `select * from "organizationToMergeRaw" otmr where not exists ( - select 1 from "llmSuggestionVerdicts" lsv + select 1 from "llmPromptHistory" lsv where ( - lsv."primaryId" = otmr."organizationId" and - lsv."secondaryId" = otmr."toMergeId" and - lsv.type = '${LLMSuggestionVerdictType.ORGANIZATION}' + lsv."entityId" = otmr."organizationId" and + (lsv.metadata ->> 'secondaryId')::uuid = otmr."toMergeId" and + lsv.type = '${LlmQueryType.ORGANIZATION_MERGE}' ) or ( - lsv."primaryId" = otmr."toMergeId" and - lsv."secondaryId" = otmr."organizationId" and - lsv.type = '${LLMSuggestionVerdictType.ORGANIZATION}' + lsv."entityId" = otmr."toMergeId" and + (lsv.metadata ->> 'secondaryId')::uuid = otmr."organizationId" and + lsv.type = '${LlmQueryType.ORGANIZATION_MERGE}' ) ) ${similarityLTEFilter} diff --git a/services/libs/data-access-layer/src/old/apps/premium/members_enrichment_worker/index.ts b/services/libs/data-access-layer/src/old/apps/premium/members_enrichment_worker/index.ts index ec0df7b091..37a0a4ef3c 100644 --- a/services/libs/data-access-layer/src/old/apps/premium/members_enrichment_worker/index.ts +++ b/services/libs/data-access-layer/src/old/apps/premium/members_enrichment_worker/index.ts @@ -6,12 +6,67 @@ import { IMemberEnrichmentCache, IMemberEnrichmentSourceQueryInput, IMemberIdentity, + IMemberOriginalData, IOrganizationIdentity, MemberEnrichmentSource, MemberIdentityType, OrganizationSource, } from '@crowd/types' +export async function fetchMemberDataForLLMSquashing( + db: DbStore, + memberId: string, +): Promise { + const result = await db.connection().oneOrNone( + ` + with member_orgs as (select distinct mo."memberId", + mo."organizationId" as "orgId", + o."displayName" as "orgName", + mo.title as "jobTitle", + mo."dateStart", + mo."dateEnd", + mo.source + from "memberOrganizations" mo + inner join organizations o on mo."organizationId" = o.id + where mo."memberId" = $(memberId) + and mo."deletedAt" is null + and o."deletedAt" is null) + select m."displayName", + m.attributes, + m."manuallyChangedFields", + (select json_agg( + (select row_to_json(r) + from (select mi.type, + mi.platform, + mi.value) r) + ) + from "memberIdentities" mi + where mi."memberId" = m.id + and verified = true) as identities, + json_agg( + (select row_to_json(r) + from (select mo."orgId", + mo."orgName", + mo."jobTitle", + mo."dateStart", + mo."dateEnd", + mo.source) r) + ) as organizations + + from members m + left join member_orgs mo on mo."memberId" = m.id + where m.id = $(memberId) + and m."deletedAt" is null + group by m.id, m."displayName", m.attributes, 
m."manuallyChangedFields"; + `, + { + memberId, + }, + ) + + return result +} + export async function fetchMembersForEnrichment( db: DbStore, limit: number, @@ -469,6 +524,10 @@ export async function updateMemberAttributes( ) } +export async function resetMemberEnrichedAt(tx: DbConnOrTx, memberId: string): Promise { + await tx.none(`update members set "lastEnriched" = null where id = $(memberId);`, { memberId }) +} + export async function insertMemberEnrichmentCacheDb( tx: DbConnOrTx, data: T, @@ -476,11 +535,13 @@ export async function insertMemberEnrichmentCacheDb( source: MemberEnrichmentSource, ) { const dataSanitized = data ? redactNullByte(JSON.stringify(data)) : null - return tx.query( + const res = await tx.query( `INSERT INTO "memberEnrichmentCache" ("memberId", "data", "createdAt", "updatedAt", "source") VALUES ($1, $2, NOW(), NOW(), $3);`, [memberId, dataSanitized, source], ) + await resetMemberEnrichedAt(tx, memberId) + return res } export async function updateMemberEnrichmentCacheDb( @@ -490,7 +551,7 @@ export async function updateMemberEnrichmentCacheDb( source: MemberEnrichmentSource, ) { const dataSanitized = data ? redactNullByte(JSON.stringify(data)) : null - return tx.query( + const res = await tx.query( `UPDATE "memberEnrichmentCache" SET "updatedAt" = NOW(), @@ -498,6 +559,8 @@ export async function updateMemberEnrichmentCacheDb( WHERE "memberId" = $1 and source = $3;`, [memberId, dataSanitized, source], ) + await resetMemberEnrichedAt(tx, memberId) + return res } export async function touchMemberEnrichmentCacheUpdatedAtDb( @@ -516,20 +579,20 @@ export async function touchMemberEnrichmentCacheUpdatedAtDb( export async function findMemberEnrichmentCacheDb( tx: DbConnOrTx, memberId: string, - source: MemberEnrichmentSource, -): Promise> { - const result = await tx.oneOrNone( + sources: MemberEnrichmentSource[], +): Promise[]> { + const results = await tx.any( ` select * from "memberEnrichmentCache" where - source = $(source) + source in ($(sources:csv)) and "memberId" = $(memberId); `, - { source, memberId }, + { sources, memberId }, ) - return result ?? 
null + return results } export async function findMemberEnrichmentCacheForAllSourcesDb( diff --git a/services/libs/types/src/enums/activities.ts b/services/libs/types/src/enums/activities.ts deleted file mode 100644 index 09d3e80d26..0000000000 --- a/services/libs/types/src/enums/activities.ts +++ /dev/null @@ -1 +0,0 @@ -export const test2 = 1 diff --git a/services/libs/types/src/enums/index.ts b/services/libs/types/src/enums/index.ts index d46a868d60..0e4e1c3646 100644 --- a/services/libs/types/src/enums/index.ts +++ b/services/libs/types/src/enums/index.ts @@ -1,4 +1,3 @@ -export * from './activities' export * from './customViews' export * from './edition' export * from './entities' @@ -18,3 +17,4 @@ export * from './suggestions' export * from './exports' export * from './dataIssues' export * from './enrichment' +export * from './llm' diff --git a/services/libs/types/src/enums/llm.ts b/services/libs/types/src/enums/llm.ts new file mode 100644 index 0000000000..9649bb1954 --- /dev/null +++ b/services/libs/types/src/enums/llm.ts @@ -0,0 +1,10 @@ +export enum LlmModelType { + CLAUDE_3_5_SONNET = 'anthropic.claude-3-5-sonnet-20240620-v1:0', + CLAUDE_3_OPUS = 'anthropic.claude-3-opus-20240229-v1:0', +} + +export enum LlmQueryType { + MEMBER_MERGE = 'member_merge_suggestion', + ORGANIZATION_MERGE = 'organization_merge_suggestion', + MEMBER_ENRICHMENT = 'member_enrichment', +} diff --git a/services/libs/types/src/enums/merging.ts b/services/libs/types/src/enums/merging.ts index 9ca2460b16..3eb3a67d88 100644 --- a/services/libs/types/src/enums/merging.ts +++ b/services/libs/types/src/enums/merging.ts @@ -28,8 +28,3 @@ export enum MemberRoleUnmergeStrategy { SAME_MEMBER = 'same-member', SAME_ORGANIZATION = 'same-organization', } - -export enum LLMSuggestionVerdictType { - MEMBER = 'member', - ORGANIZATION = 'organization', -} diff --git a/services/libs/types/src/index.ts b/services/libs/types/src/index.ts index 93a56b0e65..a8ef7ee906 100644 --- a/services/libs/types/src/index.ts +++ b/services/libs/types/src/index.ts @@ -60,3 +60,5 @@ export * from './productAnalytics' export * from './dataIssues' export * from './premium' + +export * from './llm' diff --git a/services/libs/types/src/llm.ts b/services/libs/types/src/llm.ts new file mode 100644 index 0000000000..8bf991c125 --- /dev/null +++ b/services/libs/types/src/llm.ts @@ -0,0 +1,73 @@ +import { LlmModelType, LlmQueryType } from './enums' + +export interface ILlmHistoryEntry extends ILlmResponse { + type: LlmQueryType + entityId?: string + metadata?: Record +} + +export interface ILlmResponse { + model: LlmModelType + prompt: string + answer: string + inputTokenCount: number +} + +export interface ILlmResult extends ILlmResponse { + result: T +} + +export interface ILlmSettings { + modelId: LlmModelType + arguments: unknown +} + +export interface ILlmPricing { + costPer1000InputTokens: number + costPer1000OutputTokens: number +} + +export const LLM_MODEL_REGION_MAP: Record = { + [LlmModelType.CLAUDE_3_OPUS]: 'us-west-2', + [LlmModelType.CLAUDE_3_5_SONNET]: 'us-east-1', +} + +// to estimate costs - these numbers can change +export const LLM_MODEL_PRICING_MAP: Record = { + [LlmModelType.CLAUDE_3_OPUS]: { + costPer1000InputTokens: 0.015, + costPer1000OutputTokens: 0.075, + }, + [LlmModelType.CLAUDE_3_5_SONNET]: { + costPer1000InputTokens: 0.003, + costPer1000OutputTokens: 0.015, + }, +} + +export const LLM_SETTINGS: Record = { + [LlmQueryType.MEMBER_MERGE]: { + modelId: LlmModelType.CLAUDE_3_OPUS, + arguments: { + max_tokens: 2000, + 
anthropic_version: 'bedrock-2023-05-31', + temperature: 0, + }, + }, + [LlmQueryType.ORGANIZATION_MERGE]: { + modelId: LlmModelType.CLAUDE_3_OPUS, + arguments: { + max_tokens: 2000, + anthropic_version: 'bedrock-2023-05-31', + temperature: 0, + }, + }, + [LlmQueryType.MEMBER_ENRICHMENT]: { + modelId: LlmModelType.CLAUDE_3_5_SONNET, + arguments: { + // TODO uros check if this is ok + max_tokens: 200000, + anthropic_version: 'bedrock-2023-05-31', + temperature: 0, + }, + }, +} diff --git a/services/libs/types/src/members.ts b/services/libs/types/src/members.ts index ff58d79494..3b787c4cd1 100644 --- a/services/libs/types/src/members.ts +++ b/services/libs/types/src/members.ts @@ -144,6 +144,7 @@ export interface IMemberRenderFriendlyRole { } export interface ILLMConsumableMemberDbResult { + id: string displayName: string attributes: IAttributes joinedAt: string @@ -159,6 +160,7 @@ export interface ILLMConsumableMemberDbResult { } export interface ILLMConsumableMember { + id: string displayName: string attributes: IAttributes joinedAt: string diff --git a/services/libs/types/src/merging.ts b/services/libs/types/src/merging.ts index 3a974caf75..96a5289f6c 100644 --- a/services/libs/types/src/merging.ts +++ b/services/libs/types/src/merging.ts @@ -12,7 +12,6 @@ import { IOrganization, ITag, ITask, - LLMSuggestionVerdictType, MergeActionState, MergeActionStep, MergeActionType, @@ -93,17 +92,3 @@ export interface IOrganizationUnmergePreviewResult extends IOrganization { memberCount: number activityCount: number } - -export interface ILLMSuggestionVerdict { - id?: string - type: LLMSuggestionVerdictType - model: string - primaryId: string - secondaryId: string - prompt: string - verdict: string - inputTokenCount: number - outputTokenCount: number - responseTimeSeconds: number - createdAt?: string -} diff --git a/services/libs/types/src/organizations.ts b/services/libs/types/src/organizations.ts index 4f3b4c66fb..2462a285e0 100644 --- a/services/libs/types/src/organizations.ts +++ b/services/libs/types/src/organizations.ts @@ -166,6 +166,7 @@ export interface ILLMConsumableOrganizationDbResult { } export interface ILLMConsumableOrganization { + id: string displayName: string description: string phoneNumbers: string[] diff --git a/services/libs/types/src/premium/enrichment.ts b/services/libs/types/src/premium/enrichment.ts index e6754a3ee7..08ffc350fd 100644 --- a/services/libs/types/src/premium/enrichment.ts +++ b/services/libs/types/src/premium/enrichment.ts @@ -30,3 +30,25 @@ export interface IEnrichableMemberIdentityActivityAggregate { username: string platform: string } + +export interface IMemberOrganizationData { + orgId: string + orgName: string + jobTitle: string + dateStart: string + dateEnd: string + source: string +} + +export interface IMemberOriginalData { + // members table data + displayName: string + attributes: Record> + manuallyChangedFields: string[] + + // memberIdentities table data + identities: IMemberIdentity[] + + // memberOrganizations table data + organizations: IMemberOrganizationData[] +} From f4f3784ef36672939149ce344e58d059d1c043f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Wed, 13 Nov 2024 14:42:37 +0100 Subject: [PATCH 2/6] Improved prompt --- .../src/activities/enrichment.ts | 164 ++++++++++++------ 1 file changed, 107 insertions(+), 57 deletions(-) diff --git a/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts b/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts index 
32f5d18ddf..83db2ce5a4 100644 --- a/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts +++ b/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts @@ -193,69 +193,119 @@ export async function processMemberSources( ) const prompt = ` - You are a data consolidation expert specializing in professional profile data. - Your task is to analyze and merge member data from an existing database with enriched data from multiple sources. - - EXISTING VERIFIED MEMBER DATA: - ${JSON.stringify(existingMemberData)} - - ENRICHED DATA FROM MULTIPLE SOURCES: - ${JSON.stringify(toBeSquashed)} - - Your task is to: - - 1. IDENTITY VERIFICATION - - Analyze all provided LinkedIn profiles across sources - - Mark LinkedIn identities as verified if: - * They match an existing verified LinkedIn identity, OR - * The same LinkedIn profile appears in 2+ independent sources - - Mark LinkedIn identities as unverified if: - * They appear in only one source, OR - * Different LinkedIn profiles are found for the same person - - 2. DATA CONFIDENCE ASSESSMENT - For each piece of enriched data, determine confidence level based on: - - Match with existing verified data - - Consistency across multiple sources - - Source reliability (verified identity source > unverified source) - - Supporting evidence from other identities (email, username patterns) - - 3. DATA CONSOLIDATION - Provide a consolidated profile with: - - displayName - - attributes (with sources) - - identities (with verification status) - - organizations (with sources) - - RULES: - 1. Prefer data from verified sources over unverified ones - 2. When conflicts exist, prefer data corroborated by multiple sources - 3. For organization histories, preserve all distinct positions with their sources - 4. Maintain provenance for each data point in attributes - 5. Flag any suspicious patterns that might indicate wrong person data - 6. For conflicting data points, include both with confidence indicators - 7. When merging organization data, verify organization identity matches across sources - - Please analyze the provided data and respond with your consolidated results. - - Format your response as a JSON object matching this structure: - { - "confidenceScore": number (0-1), - "consolidatedData": { - // Match EXISTING VERIFIED MEMBER DATA structure +You are a data consolidation expert specializing in professional profile data. +Your task is to analyze and merge member data from an existing database with enriched data from multiple sources. + +EXISTING VERIFIED MEMBER DATA: +${JSON.stringify(existingMemberData)} + +ENRICHED DATA FROM MULTIPLE SOURCES: +${JSON.stringify(toBeSquashed)} + +Your task is to return ONLY THE CHANGES needed to update the existing member data. + +1. IDENTITY VERIFICATION RULES +- Mark LinkedIn identities as verified if: + * They match an existing verified LinkedIn identity, OR + * The same LinkedIn profile appears in 2+ independent sources +- Mark LinkedIn identities as unverified if: + * They appear in only one source, OR + * Different LinkedIn profiles are found for the same person + +2. 
DATA CONSOLIDATION RULES +- For identities: + * Update verification status of existing identities when appropriate + * Add new identities not present in existing data +- For attributes: + * Add new sources/values to existing attributes + * Create new attributes when not present in existing data + * Update 'default' value only when high confidence (e.g., verified LinkedIn data) +- For organizations: + * Match with existing organizations where possible using organization identities + * Create new organizations only when no match found + * Include source attribution + +Format your response as a JSON object matching this structure: +{ + "confidenceScore": number (0-1), + "changes": { + "displayName": string | null, // null if no change needed + "identityChanges": { + "updateExisting": [ // updates to existing identities + { + "type": string, + "value": string, + "platform": string, + "verified": boolean // new verification status + } + ], + "new": [ // completely new identities + { + "type": string, + "value": string, + "platform": string, + "verified": boolean + } + ] + }, + "attributeChanges": { + "updateExisting": { // updates to existing attributes + [attributeName: string]: { + "default"?: any, // include only if default value should change + [source: string]: any // only new sources to add + } }, - "reasoning": { - "identityVerification": string[], - "confidenceFactors": string[], - "conflicts": string[], - "recommendations": string[] + "new": { // completely new attributes + [attributeName: string]: { + "default": any, // required for new attributes + [source: string]: any + } } + }, + "organizationChanges": { + "newConnections": [ // new connections to existing organizations + { + "organizationId": string, + "title": string, + "dateStart": string, + "dateEnd": string, + "source": string + } + ], + "newOrganizations": [ // completely new organizations to create + { + "name": string, + "identities": [ + { + "type": string, + "value": string, + "platform": string, + "verified": boolean + } + ], + "connection": { + "title": string, + "dateStart": string, + "dateEnd": string, + "source": string + } + } + ] } - Answer with JSON only and nothing else. + }, + "reasoning": { + "identityVerification": string[], + "confidenceFactors": string[], + "conflicts": string[], + "recommendations": string[] + } +} + +Answer with JSON only and nothing else. 
` const result = await llmService.consolidateMemberEnrichmentData(memberId, prompt) - this.log.info({ memberId }, 'LLM result') + svc.log.info({ memberId }, 'LLM result') } else { svc.log.debug({ memberId }, 'No data to squash for member!') } From e25549867628af730b5878e2a06e3d8265a0e3e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Thu, 14 Nov 2024 12:51:45 +0100 Subject: [PATCH 3/6] wip --- .../V1731052735__llm-prompt-history.sql | 47 +----- .../merge_suggestions_worker/package.json | 1 - .../src/activities.ts | 9 +- .../src/activities/common.ts | 78 +++++++-- .../src/activities/memberMergeSuggestions.ts | 2 - .../organizationMergeSuggestions.ts | 1 - .../merge_suggestions_worker/src/types.ts | 25 +++ .../src/workflows/mergeMembersWithLLM.ts | 50 +++++- .../workflows/mergeOrganizationsWithLLM.ts | 33 +++- .../workflows/testMergingEntitiesWithLLM.ts | 24 +-- .../src/activities/enrichment.ts | 159 ++++++++++-------- .../src/bin/onboarding.ts | 76 +++++++++ .../members_enrichment_worker/src/types.ts | 2 + .../src/services/llm.service.ts | 56 +----- .../llmSuggestionVerdicts.repo.ts | 43 +++++ .../memberMergeSuggestions.repo.ts | 17 +- .../organizationMergeSuggestions.repo.ts | 32 ++-- .../members_enrichment_worker/index.ts | 16 +- services/libs/types/src/enums/llm.ts | 2 - services/libs/types/src/enums/merging.ts | 5 + services/libs/types/src/llm.ts | 73 ++++++-- services/libs/types/src/members.ts | 2 - services/libs/types/src/merging.ts | 15 ++ services/libs/types/src/organizations.ts | 1 - 24 files changed, 516 insertions(+), 253 deletions(-) create mode 100644 services/apps/premium/members_enrichment_worker/src/bin/onboarding.ts create mode 100644 services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/llmSuggestionVerdicts.repo.ts diff --git a/backend/src/database/migrations/V1731052735__llm-prompt-history.sql b/backend/src/database/migrations/V1731052735__llm-prompt-history.sql index 19964fb7c3..41a3a9adba 100644 --- a/backend/src/database/migrations/V1731052735__llm-prompt-history.sql +++ b/backend/src/database/migrations/V1731052735__llm-prompt-history.sql @@ -14,49 +14,4 @@ create table "llmPromptHistory" ( create index "ix_llmPromptHistory_type_entityId" on "llmPromptHistory"("type", "entityId"); create index "ix_llmPromptHistory_entityId" on "llmPromptHistory"("entityId"); -create index "ix_llmPromptHistory_type" on "llmPromptHistory"("type"); -create index "ix_llmPromptHistory_secondaryId" on "llmPromptHistory" (((metadata->>'secondaryId')::uuid)) where type in ('organization_merge_suggestion', 'member_merge_suggestion'); - -insert into "llmPromptHistory"(type, model, "entityId", metadata, prompt, answer, "inputTokenCount", "outputTokenCount", "responseTimeSeconds") -select 'organization_merge_suggestion', - model, - "primaryId", - json_build_object( - 'secondaryId', "secondaryId" - ), - prompt, - verdict, - "inputTokenCount", - "outputTokenCount", - "responseTimeSeconds" -from "llmSuggestionVerdicts" -where type = 'organization'; - -delete from "llmSuggestionVerdicts" where type = 'organization'; - -insert into "llmPromptHistory"(type, model, "entityId", metadata, prompt, answer, "inputTokenCount", "outputTokenCount", "responseTimeSeconds") -select 'member_merge_suggestion', - model, - "primaryId", - json_build_object( - 'secondaryId', "secondaryId" - ), - prompt, - verdict, - "inputTokenCount", - "outputTokenCount", - "responseTimeSeconds" -from "llmSuggestionVerdicts" -where type = 'member'; - -delete from "llmSuggestionVerdicts" where type 
= 'member'; - -do -$$ - begin - if (select count(*) from "llmSuggestionVerdicts") > 0 then - raise exception 'Table llmSuggestionVerdicts is not empty - contains % rows', (select count(*) from "llmSuggestionVerdicts"); - end if; - drop table "llmSuggestionVerdicts"; - end -$$; \ No newline at end of file +create index "ix_llmPromptHistory_type" on "llmPromptHistory"("type"); \ No newline at end of file diff --git a/services/apps/merge_suggestions_worker/package.json b/services/apps/merge_suggestions_worker/package.json index 3ca639f251..94198d4ea4 100644 --- a/services/apps/merge_suggestions_worker/package.json +++ b/services/apps/merge_suggestions_worker/package.json @@ -16,7 +16,6 @@ "@crowd/archetype-standard": "workspace:*", "@crowd/archetype-worker": "workspace:*", "@crowd/common": "workspace:*", - "@crowd/common_services": "workspace:*", "@crowd/data-access-layer": "workspace:*", "@crowd/feature-flags": "workspace:*", "@crowd/logging": "workspace:*", diff --git a/services/apps/merge_suggestions_worker/src/activities.ts b/services/apps/merge_suggestions_worker/src/activities.ts index 3f26286967..fe3cde744d 100644 --- a/services/apps/merge_suggestions_worker/src/activities.ts +++ b/services/apps/merge_suggestions_worker/src/activities.ts @@ -1,4 +1,10 @@ -import { getAllTenants, getLLMResult, mergeMembers, mergeOrganizations } from './activities/common' +import { + getAllTenants, + getLLMResult, + mergeMembers, + mergeOrganizations, + saveLLMVerdict, +} from './activities/common' import { addMemberToMerge, findTenantsLatestMemberSuggestionGeneratedAt, @@ -35,6 +41,7 @@ export { getOrganizationsForLLMConsumption, getRawOrganizationMergeSuggestions, getRawMemberMergeSuggestions, + saveLLMVerdict, mergeMembers, mergeOrganizations, } diff --git a/services/apps/merge_suggestions_worker/src/activities/common.ts b/services/apps/merge_suggestions_worker/src/activities/common.ts index 8bf077ae62..c55cfdb9e4 100644 --- a/services/apps/merge_suggestions_worker/src/activities/common.ts +++ b/services/apps/merge_suggestions_worker/src/activities/common.ts @@ -1,19 +1,21 @@ /* eslint-disable @typescript-eslint/no-explicit-any */ +import { BedrockRuntimeClient, InvokeModelCommand } from '@aws-sdk/client-bedrock-runtime' import axios from 'axios' +import { performance } from 'perf_hooks' -import { LlmService } from '@crowd/common_services' import { ITenant } from '@crowd/data-access-layer/src/old/apps/merge_suggestions_worker//types' +import LLMSuggestionVerdictsRepository from '@crowd/data-access-layer/src/old/apps/merge_suggestions_worker/llmSuggestionVerdicts.repo' import TenantRepository from '@crowd/data-access-layer/src/old/apps/merge_suggestions_worker/tenant.repo' import { isFeatureEnabled } from '@crowd/feature-flags' import { FeatureFlag, ILLMConsumableMember, ILLMConsumableOrganization, - ILlmResult, - LlmQueryType, + ILLMSuggestionVerdict, } from '@crowd/types' import { svc } from '../main' +import { ILLMResult } from '../types' export async function getAllTenants(): Promise { const tenantRepository = new TenantRepository(svc.postgres.writer.connection(), svc.log) @@ -41,22 +43,74 @@ export async function getAllTenants(): Promise { } export async function getLLMResult( - type: LlmQueryType.MEMBER_MERGE | LlmQueryType.ORGANIZATION_MERGE, suggestion: ILLMConsumableMember[] | ILLMConsumableOrganization[], -): Promise> { - const llmService = new LlmService( - svc.postgres.writer, - { + modelId: string, + prompt: string, + region: string, + modelSpecificArgs: any, +): Promise { + if 
(suggestion.length !== 2) { + console.log(suggestion) + throw new Error('Exactly 2 entities are required for LLM comparison') + } + const client = new BedrockRuntimeClient({ + credentials: { accessKeyId: process.env['CROWD_AWS_BEDROCK_ACCESS_KEY_ID'], secretAccessKey: process.env['CROWD_AWS_BEDROCK_SECRET_ACCESS_KEY'], }, - svc.log, - ) + region, + }) + + const start = performance.now() + + const end = () => { + const end = performance.now() + const duration = end - start + return Math.ceil(duration / 1000) + } + + const fullPrompt = `Your task is to analyze the following two json documents. ${JSON.stringify( + suggestion, + )} . ${prompt}` - const result = await llmService.mergeSuggestion(type, suggestion) + const command = new InvokeModelCommand({ + body: JSON.stringify({ + messages: [ + { + role: 'user', + content: [ + { + type: 'text', + text: fullPrompt, + }, + ], + }, + ], + ...modelSpecificArgs, + }), + modelId, + accept: 'application/json', + contentType: 'application/json', + }) - return result + const res = await client.send(command) + + return { + body: JSON.parse(res.body.transformToString()), + prompt: fullPrompt, + modelSpecificArgs, + responseTimeSeconds: end(), + } } + +export async function saveLLMVerdict(verdict: ILLMSuggestionVerdict): Promise { + const llmVerdictRepository = new LLMSuggestionVerdictsRepository( + svc.postgres.writer.connection(), + svc.log, + ) + return llmVerdictRepository.saveLLMVerdict(verdict) +} + export async function mergeMembers( primaryMemberId: string, secondaryMemberId: string, diff --git a/services/apps/merge_suggestions_worker/src/activities/memberMergeSuggestions.ts b/services/apps/merge_suggestions_worker/src/activities/memberMergeSuggestions.ts index 895835abff..48e448732c 100644 --- a/services/apps/merge_suggestions_worker/src/activities/memberMergeSuggestions.ts +++ b/services/apps/merge_suggestions_worker/src/activities/memberMergeSuggestions.ts @@ -307,7 +307,6 @@ export async function getMembersForLLMConsumption( if (primaryMember) { result.push({ - id: primaryMember.id, displayName: primaryMember.displayName, joinedAt: primaryMember.joinedAt, attributes: primaryMember.attributes, @@ -324,7 +323,6 @@ export async function getMembersForLLMConsumption( if (secondaryMember) { result.push({ - id: secondaryMember.id, joinedAt: secondaryMember.joinedAt, displayName: secondaryMember.displayName, attributes: secondaryMember.attributes, diff --git a/services/apps/merge_suggestions_worker/src/activities/organizationMergeSuggestions.ts b/services/apps/merge_suggestions_worker/src/activities/organizationMergeSuggestions.ts index 99e9661bb1..bec8f29746 100644 --- a/services/apps/merge_suggestions_worker/src/activities/organizationMergeSuggestions.ts +++ b/services/apps/merge_suggestions_worker/src/activities/organizationMergeSuggestions.ts @@ -358,7 +358,6 @@ async function prepareOrg( ]) return { - id: base.id, displayName: base.displayName, description: base.description, phoneNumbers: attributes.filter((a) => a.name === 'phoneNumber').map((a) => a.value), diff --git a/services/apps/merge_suggestions_worker/src/types.ts b/services/apps/merge_suggestions_worker/src/types.ts index dfc588f11a..6a2f292660 100644 --- a/services/apps/merge_suggestions_worker/src/types.ts +++ b/services/apps/merge_suggestions_worker/src/types.ts @@ -43,6 +43,31 @@ export type IOrganizationFilter = | IRangeFilterCreatedAt | IExistsFilter +export interface ILLMResult { + body: ILLMBody + prompt: string + responseTimeSeconds: number + // eslint-disable-next-line 
@typescript-eslint/no-explicit-any + modelSpecificArgs: any +} + +export interface ILLMBody { + id: string + type: string + role: string + model: string + content: { + type: string + text: string + }[] + stop_reason: string + stop_sequence: string + usage: { + input_tokens: number + output_tokens: number + } +} + export interface IProcessGenerateMemberMergeSuggestionsArgs { tenantId: string lastUuid?: string diff --git a/services/apps/merge_suggestions_worker/src/workflows/mergeMembersWithLLM.ts b/services/apps/merge_suggestions_worker/src/workflows/mergeMembersWithLLM.ts index 21dc427687..65890d85a5 100644 --- a/services/apps/merge_suggestions_worker/src/workflows/mergeMembersWithLLM.ts +++ b/services/apps/merge_suggestions_worker/src/workflows/mergeMembersWithLLM.ts @@ -1,10 +1,10 @@ import { continueAsNew, proxyActivities } from '@temporalio/workflow' -import { LlmQueryType } from '@crowd/types' +import { LLMSuggestionVerdictType } from '@crowd/types' import * as commonActivities from '../activities/common' import * as memberActivities from '../activities/memberMergeSuggestions' -import { IProcessMergeMemberSuggestionsWithLLM } from '../types' +import { ILLMResult, IProcessMergeMemberSuggestionsWithLLM } from '../types' import { removeEmailLikeIdentitiesFromMember } from '../utils' const memberActivitiesProxy = proxyActivities({ @@ -23,6 +23,31 @@ export async function mergeMembersWithLLM( args: IProcessMergeMemberSuggestionsWithLLM, ): Promise { const SUGGESTIONS_PER_RUN = 10 + const REGION = 'us-west-2' + const MODEL_ID = 'anthropic.claude-3-opus-20240229-v1:0' + const MODEL_ARGS = { + max_tokens: 2000, + anthropic_version: 'bedrock-2023-05-31', + temperature: 0, + } + const PROMPT = `Please compare and come up with a boolean answer if these two members are the same person or not. + Only compare data from first member and second member. Never compare data from only one member with itself. + Never tokenize 'platform' field using character tokenization. Use word tokenization for platform field in identities. + You should check all the sent fields between members to find similarities both literally and semantically. + Here are the fields written with respect to their importance and how to check. Identities >> Organizations > Attributes and other fields >> Display name - + 1. Identities: Tokenize value field (identity.value) using character tokenization. Exact match or identities with edit distance <= 2 suggests that members are similar. + Don't compare identities in a single member. Only compare identities between members. + 2. Organizations: Members are more likely to be the same when they have/had roles in similar organizations. + If there are no intersecting organizations it doesn't necessarily mean that they're different members. + 3. Attributes and other fields: If one member has a specific field and the other member doesn't, skip that field when deciding similarity. + Checking semantically instead of literally is important for such fields. Important fields here are: location, timezone, languages, programming languages. + For example, one member might have Berlin in location, while the other can have Germany - consider such members to have the same location. + 4. Display Name: Tokenize using both character and word tokenization. When the display name is more than one word and the difference is a few edit distances, consider it a strong indication of similarity. + When one display name is contained by the other, check other fields for the final decision. 
The same members on different platforms might have different display names. + Display names can be multiple words and might be sorted in different order in different platforms for the same member. + Pro tip: If members have identities in the same platform (member1.identities[x].platform === member2.identities[y].platform) and if these identities have different usernames(member1.identities[x].value !== member2.identities[y].value) you can label them as different. + Only do such labeling if both members have identities in the same platform. If they don't have identities in the same platform ignore the pro tip. + Print 'true' if they are the same member, 'false' otherwise. No explanation required. Don't print anything else.` const suggestions = await memberActivitiesProxy.getRawMemberMergeSuggestions( args.similarity, @@ -41,12 +66,27 @@ export async function mergeMembersWithLLM( continue } - const verdict = await commonActivitiesProxy.getLLMResult( - LlmQueryType.MEMBER_MERGE, + const llmResult: ILLMResult = await commonActivitiesProxy.getLLMResult( members.map((member) => removeEmailLikeIdentitiesFromMember(member)), + MODEL_ID, + PROMPT, + REGION, + MODEL_ARGS, ) - if (verdict) { + await commonActivitiesProxy.saveLLMVerdict({ + type: LLMSuggestionVerdictType.MEMBER, + model: MODEL_ID, + primaryId: suggestion[0], + secondaryId: suggestion[1], + prompt: llmResult.prompt, + responseTimeSeconds: llmResult.responseTimeSeconds, + inputTokenCount: llmResult.body.usage.input_tokens, + outputTokenCount: llmResult.body.usage.output_tokens, + verdict: llmResult.body.content[0].text, + }) + + if (llmResult.body.content[0].text === 'true') { await commonActivitiesProxy.mergeMembers(suggestion[0], suggestion[1], args.tenantId) } } diff --git a/services/apps/merge_suggestions_worker/src/workflows/mergeOrganizationsWithLLM.ts b/services/apps/merge_suggestions_worker/src/workflows/mergeOrganizationsWithLLM.ts index 75e9c55ebc..4b2e5bc63a 100644 --- a/services/apps/merge_suggestions_worker/src/workflows/mergeOrganizationsWithLLM.ts +++ b/services/apps/merge_suggestions_worker/src/workflows/mergeOrganizationsWithLLM.ts @@ -1,10 +1,10 @@ import { continueAsNew, proxyActivities } from '@temporalio/workflow' -import { LlmQueryType } from '@crowd/types' +import { LLMSuggestionVerdictType } from '@crowd/types' import * as commonActivities from '../activities/common' import * as organizationActivities from '../activities/organizationMergeSuggestions' -import { IProcessMergeOrganizationSuggestionsWithLLM } from '../types' +import { ILLMResult, IProcessMergeOrganizationSuggestionsWithLLM } from '../types' const organizationActivitiesProxy = proxyActivities({ startToCloseTimeout: '1 minute', @@ -22,6 +22,14 @@ export async function mergeOrganizationsWithLLM( args: IProcessMergeOrganizationSuggestionsWithLLM, ): Promise { const SUGGESTIONS_PER_RUN = 5 + const REGION = 'us-west-2' + const MODEL_ID = 'anthropic.claude-3-opus-20240229-v1:0' + const MODEL_ARGS = { + max_tokens: 2000, + anthropic_version: 'bedrock-2023-05-31', + temperature: 0, + } + const PROMPT = `Please compare and come up with a boolean answer if these two organizations are the same organization or not. Print 'true' if they are the same organization, 'false' otherwise. No explanation required. 
Don't print anything else.` const suggestions = await organizationActivitiesProxy.getRawOrganizationMergeSuggestions( args.tenantId, @@ -46,12 +54,27 @@ export async function mergeOrganizationsWithLLM( continue } - const verdict = await commonActivitiesProxy.getLLMResult( - LlmQueryType.ORGANIZATION_MERGE, + const llmResult: ILLMResult = await commonActivitiesProxy.getLLMResult( organizations, + MODEL_ID, + PROMPT, + REGION, + MODEL_ARGS, ) - if (verdict) { + await commonActivitiesProxy.saveLLMVerdict({ + type: LLMSuggestionVerdictType.ORGANIZATION, + model: MODEL_ID, + primaryId: suggestion[0], + secondaryId: suggestion[1], + prompt: llmResult.prompt, + responseTimeSeconds: llmResult.responseTimeSeconds, + inputTokenCount: llmResult.body.usage.input_tokens, + outputTokenCount: llmResult.body.usage.output_tokens, + verdict: llmResult.body.content[0].text, + }) + + if (llmResult.body.content[0].text === 'true') { console.log( `LLM verdict says these two orgs are the same. Merging organizations: ${suggestion[0]} and ${suggestion[1]}!`, ) diff --git a/services/apps/merge_suggestions_worker/src/workflows/testMergingEntitiesWithLLM.ts b/services/apps/merge_suggestions_worker/src/workflows/testMergingEntitiesWithLLM.ts index 68c315f01b..71156f46c1 100644 --- a/services/apps/merge_suggestions_worker/src/workflows/testMergingEntitiesWithLLM.ts +++ b/services/apps/merge_suggestions_worker/src/workflows/testMergingEntitiesWithLLM.ts @@ -1,11 +1,9 @@ import { proxyActivities } from '@temporalio/workflow' -import { LlmQueryType } from '@crowd/types' - import * as commonActivities from '../activities/common' import * as memberActivities from '../activities/memberMergeSuggestions' import * as organizationActivities from '../activities/organizationMergeSuggestions' -import { IProcessCheckSimilarityWithLLM } from '../types' +import { ILLMResult, IProcessCheckSimilarityWithLLM } from '../types' import { removeEmailLikeIdentitiesFromMember } from '../utils' const memberActivitiesProxy = proxyActivities({ @@ -41,13 +39,16 @@ export async function testMergingEntitiesWithLLM( continue } - const res = await commonActivitiesProxy.getLLMResult( - LlmQueryType.MEMBER_MERGE, + const res: ILLMResult = await commonActivitiesProxy.getLLMResult( members.map((member) => removeEmailLikeIdentitiesFromMember(member)), + args.modelId, + args.prompt, + args.region, + args.modelSpecificArgs, ) console.log(`Raw res: `) - console.log(res.answer) - totalInputTokenCount += res.inputTokenCount + console.log(res.body) + totalInputTokenCount += res.body.usage.input_tokens promptCount += 1 } } @@ -68,12 +69,15 @@ export async function testMergingEntitiesWithLLM( } const res = await commonActivitiesProxy.getLLMResult( - LlmQueryType.ORGANIZATION_MERGE, organizations, + args.modelId, + args.prompt, + args.region, + args.modelSpecificArgs, ) console.log(`Raw res: `) - console.log(res.answer) - totalInputTokenCount += res.inputTokenCount + console.log(res.body) + totalInputTokenCount += res.body.usage.input_tokens promptCount += 1 } } diff --git a/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts b/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts index 83db2ce5a4..3c3163aacd 100644 --- a/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts +++ b/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts @@ -146,7 +146,8 @@ export async function processMemberSources( svc.log.debug({ memberId }, 'Processing member sources!') const toBeSquashed 
= {} - // const toBeSquashedContributions = {} + const toBeSquashedContributions = {} + // find if there's already saved enrichment data in source const caches = await findMemberEnrichmentCache(sources, memberId) for (const source of sources) { @@ -158,20 +159,30 @@ export async function processMemberSources( )) as IMemberEnrichmentDataNormalized // TODO uros temp remove contributions from sources to mitigate context size - // if (Array.isArray(normalized)) { - // const normalizedContributions = [] - // for (const n of normalized) { - // if (n.contributions) { - // normalizedContributions.push(n.contributions) - // delete n.contributions - // } - // } - - // toBeSquashedContributions[source] = normalizedContributions - // } else if (normalized.contributions) { - // toBeSquashedContributions[source] = normalized.contributions - // delete normalized.contributions - // } + if (Array.isArray(normalized)) { + const normalizedContributions = [] + for (const n of normalized) { + if (n.contributions) { + normalizedContributions.push(n.contributions) + delete n.contributions + } + + if (n.reach) { + delete n.reach + } + } + + toBeSquashedContributions[source] = normalizedContributions + } + + if (normalized.contributions) { + toBeSquashedContributions[source] = normalized.contributions + delete normalized.contributions + } + + if (normalized.reach) { + delete normalized.reach + } toBeSquashed[source] = normalized } @@ -196,10 +207,10 @@ export async function processMemberSources( You are a data consolidation expert specializing in professional profile data. Your task is to analyze and merge member data from an existing database with enriched data from multiple sources. -EXISTING VERIFIED MEMBER DATA: +EXISTING_VERIFIED_MEMBER_DATA: ${JSON.stringify(existingMemberData)} -ENRICHED DATA FROM MULTIPLE SOURCES: +ENRICHED_DATA: ${JSON.stringify(toBeSquashed)} Your task is to return ONLY THE CHANGES needed to update the existing member data. @@ -214,42 +225,41 @@ Your task is to return ONLY THE CHANGES needed to update the existing member dat 2. 
DATA CONSOLIDATION RULES - For identities: - * Update verification status of existing identities when appropriate - * Add new identities not present in existing data + * Only include highest confidence identities (verified or multi-source) + * Prioritize professional identities (LinkedIn, GitHub) over social ones - For attributes: - * Add new sources/values to existing attributes - * Create new attributes when not present in existing data - * Update 'default' value only when high confidence (e.g., verified LinkedIn data) + * Only include attributes with clear evidence from multiple sources + * Prioritize professional attributes (title, location, skills) over others - For organizations: - * Match with existing organizations where possible using organization identities - * Create new organizations only when no match found - * Include source attribution + * Sort by dateStart descending and include most recent first + * Only include organizations with strong evidence of connection + * Stop adding organizations if response size getting too large Format your response as a JSON object matching this structure: { - "confidenceScore": number (0-1), + "confidence": number (0-1), "changes": { - "displayName": string | null, // null if no change needed - "identityChanges": { + "displayName": string, + "identities": { "updateExisting": [ // updates to existing identities { - "type": string, - "value": string, - "platform": string, - "verified": boolean // new verification status + "t": string, // for type + "v": string, // for value + "p": string, // for platform + "ve": boolean // new verification status } ], "new": [ // completely new identities { - "type": string, - "value": string, - "platform": string, - "verified": boolean + "t": string, // for type + "v": string, // for value + "p": string, // for platform + "ve": boolean // new verification status } ] }, - "attributeChanges": { - "updateExisting": { // updates to existing attributes + "attributes": { + "update": { // updates to existing attributes [attributeName: string]: { "default"?: any, // include only if default value should change [source: string]: any // only new sources to add @@ -262,50 +272,67 @@ Format your response as a JSON object matching this structure: } } }, - "organizationChanges": { - "newConnections": [ // new connections to existing organizations + "organizations": { + "newConns": [ // new connections to existing organizations { - "organizationId": string, - "title": string, - "dateStart": string, - "dateEnd": string, - "source": string + "orgId": string, // for organizationId - should match one of the UUIDs of orgs from EXISTING_VERIFIED_MEMBER_DATA + "t": string, // for title + "ds": string, // for dateStart + "de": string, // for dateEnd + "s": string // for source } ], - "newOrganizations": [ // completely new organizations to create + "newOrgs": [ // completely new organizations to create { - "name": string, - "identities": [ + "n": string, // for org name + "i": [ // identities { - "type": string, - "value": string, - "platform": string, - "verified": boolean + "t": string, // for type + "v": string, // for value + "p": string, // for platform + "ve": boolean // new verification status } ], - "connection": { - "title": string, - "dateStart": string, - "dateEnd": string, - "source": string + "conn": { + "title": string, // for title + "ds": string, // for dateStart + "de": string, // for dateEnd + "s": string // for source } } ] } - }, - "reasoning": { - "identityVerification": string[], - "confidenceFactors": string[], 
- "conflicts": string[], - "recommendations": string[] } } -Answer with JSON only and nothing else. +CRITICAL: If you find you cannot fit all high-confidence data in the response: +1. First omit lower confidence attributes +2. Then omit unverified identities +3. Then omit older organizations +4. Finally, only return the most essential and recent data points + +Answer with JSON only and nothing else. Ensure the response is complete and valid JSON. ` - const result = await llmService.consolidateMemberEnrichmentData(memberId, prompt) - svc.log.info({ memberId }, 'LLM result') + const data = await llmService.consolidateMemberEnrichmentData(memberId, prompt) + + if (data.result.confidence >= 0.85) { + svc.log.info({ memberId }, 'LLM returned data with high confidence!') + if (data.result.changes.displayName) { + svc.log.info( + { + memberId, + displayName: data.result.changes.displayName, + oldDisplayName: existingMemberData.displayName, + }, + 'Updating display name!', + ) + + // TODO uros update member data + } + } else { + svc.log.warn({ memberId }, 'LLM returned data with low confidence!') + } } else { svc.log.debug({ memberId }, 'No data to squash for member!') } diff --git a/services/apps/premium/members_enrichment_worker/src/bin/onboarding.ts b/services/apps/premium/members_enrichment_worker/src/bin/onboarding.ts new file mode 100644 index 0000000000..c78ec9517b --- /dev/null +++ b/services/apps/premium/members_enrichment_worker/src/bin/onboarding.ts @@ -0,0 +1,76 @@ +import { MemberEnrichmentSource } from '@crowd/types' + +import { processMemberSources } from '../activities/enrichment' +import { svc } from '../service' + +// we don't need any of these to be running like if we would run this as an actual temporal worker +// we just need pg connection, redis & service logger +process.env['CROWD_TEMPORAL_TASKQUEUE'] = 'not-important' +svc.config.envvars = [] +svc.config.producer = { enabled: false } +svc.config.redis = { enabled: true } +svc.config.temporal = { enabled: false } +svc.config.questdb = { enabled: false } +svc.options.opensearch = { enabled: false } + +const processArguments = process.argv.slice(2) + +if (processArguments.length !== 1) { + process.exit(1) +} + +const tenantId = processArguments[0] + +async function getEnrichableMembers(limit: number, lastMemberId?: string): Promise { + const query = ` + -- only use members that have more than one enrichment source + with members_with_sources as (select distinct "memberId", count(*) + from "memberEnrichmentCache" + group by "memberId" + having count(*) > 1), + -- also only use members that have more than 10 activities + members_with_activities as (select distinct msa."memberId", sum("activityCount") as total_activities + from members_with_sources ms + inner join "memberSegmentsAgg" msa on msa."memberId" = ms."memberId" + -- only consider subprojects otherwise we count some activities multiple times + inner join segments s on s.id = msa."segmentId" and s."tenantId" = $(tenantId) and s.type = 'subproject' + group by msa."memberId" + having sum("activityCount") > 100) + select m.id + from members m + inner join members_with_activities ma on m.id = ma."memberId" + where m."deletedAt" is null and m."tenantId" = $(tenantId) + ${lastMemberId ? 
`and m.id > $(lastMemberId)` : ''} + and (m."lastEnriched" is null + or m."lastEnriched" < now() - interval '3 months') + order by ma.total_activities desc, m.id + limit $(limit) + ` + + return (await svc.postgres.writer.connection().any(query, { lastMemberId, limit, tenantId })).map( + (row) => row.id, + ) +} + +const sources = Object.values(MemberEnrichmentSource) as MemberEnrichmentSource[] + +setImmediate(async () => { + await svc.init(false) + + const pageSize = 100 + let members = await getEnrichableMembers(pageSize) + // let members = ['8db6c61e-f8b0-400e-ac5d-cb550d8740c9'] + while (members.length > 0) { + svc.log.info({ memberCount: members.length }, 'Processing members!') + // process members just like in enrichMember workflow + for (const memberId of members) { + await processMemberSources(memberId, sources) + } + + // load next page + // members = await getEnrichableMembers(pageSize, members[members.length - 1]) + members = [] + } + + process.exit(0) +}) diff --git a/services/apps/premium/members_enrichment_worker/src/types.ts b/services/apps/premium/members_enrichment_worker/src/types.ts index fe706eacd2..e757c25c2d 100644 --- a/services/apps/premium/members_enrichment_worker/src/types.ts +++ b/services/apps/premium/members_enrichment_worker/src/types.ts @@ -16,6 +16,8 @@ import { IMemberEnrichmentDataProgAILinkedinScraper } from './sources/progai-lin import { IMemberEnrichmentDataProgAI } from './sources/progai/types' import { IMemberEnrichmentDataSerp } from './sources/serp/types' +/* eslint-disable @typescript-eslint/no-explicit-any */ + export interface IEnrichmentSourceInput { memberId: string github?: IMemberIdentity diff --git a/services/libs/common_services/src/services/llm.service.ts b/services/libs/common_services/src/services/llm.service.ts index 9599c83797..e8c7a3ba29 100644 --- a/services/libs/common_services/src/services/llm.service.ts +++ b/services/libs/common_services/src/services/llm.service.ts @@ -8,14 +8,13 @@ import { performance } from 'perf_hooks' import { DbStore } from '@crowd/database' import { Logger, LoggerBase } from '@crowd/logging' import { - ILLMConsumableMember, - ILLMConsumableOrganization, ILlmResponse, ILlmResult, ILlmSettings, LLM_MODEL_PRICING_MAP, LLM_MODEL_REGION_MAP, LLM_SETTINGS, + LlmMemberEnrichmentResult, LlmQueryType, } from '@crowd/types' @@ -150,37 +149,11 @@ export class LlmService extends LoggerBase { public async consolidateMemberEnrichmentData( memberId: string, prompt: string, - ): Promise> { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + ): Promise> { const response = await this.queryLlm(LlmQueryType.MEMBER_ENRICHMENT, prompt, memberId) - const result = response.answer - - return { - result, - ...response, - } - } - - public async mergeSuggestion( - type: LlmQueryType.MEMBER_MERGE | LlmQueryType.ORGANIZATION_MERGE, - suggestion: ILLMConsumableMember[] | ILLMConsumableOrganization[], - ): Promise> { - if (suggestion.length !== 2) { - console.log(suggestion) - throw new Error('Exactly 2 entities are required for LLM comparison') - } - - const prompt = type === LlmQueryType.MEMBER_MERGE ? MEMBER_PROMPT : ORGANIZATION_PROMPT - - const fullPrompt = `Your task is to analyze the following two json documents. ${JSON.stringify( - suggestion, - )} . 
${prompt}` - - const response = await this.queryLlm(type, fullPrompt, suggestion[0].id, { - secondaryId: suggestion[1].id, - }) - - const result = response.answer === 'true' + const result = JSON.parse(response.answer) return { result, @@ -188,24 +161,3 @@ export class LlmService extends LoggerBase { } } } - -const MEMBER_PROMPT = `Please compare and come up with a boolean answer if these two members are the same person or not. - Only compare data from first member and second member. Never compare data from only one member with itself. - Never tokenize 'platform' field using character tokenization. Use word tokenization for platform field in identities. - You should check all the sent fields between members to find similarities both literally and semantically. - Here are the fields written with respect to their importance and how to check. Identities >> Organizations > Attributes and other fields >> Display name - - 1. Identities: Tokenize value field (identity.value) using character tokenization. Exact match or identities with edit distance <= 2 suggests that members are similar. - Don't compare identities in a single member. Only compare identities between members. - 2. Organizations: Members are more likely to be the same when they have/had roles in similar organizations. - If there are no intersecting organizations it doesn't necessarily mean that they're different members. - 3. Attributes and other fields: If one member have a specific field and other member doesn't, skip that field when deciding similarity. - Checking semantically instead of literally is important for such fields. Important fields here are: location, timezone, languages, programming languages. - For example one member might have Berlin in location, while other can have Germany - consider such members have same location. - 4. Display Name: Tokenize using both character and word tokenization. When the display name is more than one word, and the difference is a few edit distances consider it a strong indication of similarity. - When one display name is contained by the other, check other fields for the final decision. The same members on different platforms might have different display names. - Display names can be multiple words and might be sorted in different order in different platforms for the same member. - Pro tip: If members have identities in the same platform (member1.identities[x].platform === member2.identities[y].platform) and if these identities have different usernames(member1.identities[x].value !== member2.identities[y].value) you can label them as different. - Only do such labeling if both members have identities in the same platform. If they don't have identities in the same platform ignore the pro tip. - Print 'true' if they are the same member, 'false' otherwise. No explanation required. Don't print anything else.` - -const ORGANIZATION_PROMPT = `Please compare and come up with a boolean answer if these two organizations are the same organization or not. Print 'true' if they are the same organization, 'false' otherwise. No explanation required. 
Don't print anything else.` diff --git a/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/llmSuggestionVerdicts.repo.ts b/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/llmSuggestionVerdicts.repo.ts new file mode 100644 index 0000000000..a1ac25a3a2 --- /dev/null +++ b/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/llmSuggestionVerdicts.repo.ts @@ -0,0 +1,43 @@ +import { randomUUID } from 'crypto' + +import { DbConnection, DbTransaction } from '@crowd/database' +import { Logger } from '@crowd/logging' +import { ILLMSuggestionVerdict } from '@crowd/types' + +class LLMSuggestionVerdictsRepository { + constructor( + private readonly connection: DbConnection | DbTransaction, + private readonly log: Logger, + ) {} + + async saveLLMVerdict(verdict: ILLMSuggestionVerdict): Promise { + const query = ` + insert into "llmSuggestionVerdicts" ("id", "type", "model", "primaryId", "secondaryId", "prompt", "verdict", "inputTokenCount", "outputTokenCount", "responseTimeSeconds", "createdAt") + values ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, now()) + returning "id"; + ` + let result: { id: string } + + try { + result = await this.connection.one(query, [ + randomUUID(), + verdict.type, + verdict.model, + verdict.primaryId, + verdict.secondaryId, + verdict.prompt, + verdict.verdict, + verdict.inputTokenCount, + verdict.outputTokenCount, + verdict.responseTimeSeconds, + ]) + } catch (err) { + this.log.error(err) + throw new Error(err) + } + + return result.id + } +} + +export default LLMSuggestionVerdictsRepository diff --git a/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/memberMergeSuggestions.repo.ts b/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/memberMergeSuggestions.repo.ts index 89c9d7a403..fa7976f74f 100644 --- a/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/memberMergeSuggestions.repo.ts +++ b/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/memberMergeSuggestions.repo.ts @@ -3,7 +3,7 @@ import { Logger } from '@crowd/logging' import { ILLMConsumableMemberDbResult, IMemberMergeSuggestion, - LlmQueryType, + LLMSuggestionVerdictType, MemberMergeSuggestionTable, SuggestionType, } from '@crowd/types' @@ -212,7 +212,6 @@ class MemberMergeSuggestionsRepository { const result: ILLMConsumableMemberDbResult[] = await this.connection.manyOrNone( ` select - mem.id, mem.attributes, mem."displayName", mem."joinedAt", @@ -259,17 +258,17 @@ class MemberMergeSuggestionsRepository { const query = `select * from "memberToMergeRaw" mtmr where not exists ( - select 1 from "llmPromptHistory" lsv + select 1 from "llmSuggestionVerdicts" lsv where ( - lsv."entityId" = mtmr."memberId" and - (lsv.metadata ->> 'secondaryId')::uuid = mtmr."toMergeId" and - lsv.type = '${LlmQueryType.MEMBER_MERGE}' + lsv."primaryId" = mtmr."memberId" and + lsv."secondaryId" = mtmr."toMergeId" and + lsv.type = '${LLMSuggestionVerdictType.MEMBER}' ) or ( - lsv."entityId" = mtmr."toMergeId" and - (lsv.metadata ->> 'secondaryId')::uuid = mtmr."memberId" and - lsv.type = '${LlmQueryType.MEMBER_MERGE}' + lsv."primaryId" = mtmr."toMergeId" and + lsv."secondaryId" = mtmr."memberId" and + lsv.type = '${LLMSuggestionVerdictType.MEMBER}' ) ) ${similarityLTEFilter} diff --git a/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/organizationMergeSuggestions.repo.ts b/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/organizationMergeSuggestions.repo.ts index 
f60593cf3c..310f210470 100644 --- a/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/organizationMergeSuggestions.repo.ts +++ b/services/libs/data-access-layer/src/old/apps/merge_suggestions_worker/organizationMergeSuggestions.repo.ts @@ -2,7 +2,7 @@ import { DbConnection, DbTransaction } from '@crowd/database' import { Logger } from '@crowd/logging' import { IOrganizationMergeSuggestion, - LlmQueryType, + LLMSuggestionVerdictType, OrganizationMergeSuggestionTable, SuggestionType, } from '@crowd/types' @@ -205,7 +205,7 @@ class OrganizationMergeSuggestionsRepository { /** * We get raw (unfiltered) suggestions from the database. * When onlyLFXMembers is true it only returns suggestions for lfx member organizations. - * All returned suggestions are checked against the "llmPromptHistory" table to see if they have already been processed. + * All returned suggestions are checked against the "llmSuggestionVerdicts" table to see if they have already been processed. * Already processed suggestions will not be returned. * @param similarityFilter * @param limit @@ -254,17 +254,17 @@ class OrganizationMergeSuggestionsRepository { select distinct s."organizationId", s."toMergeId" from suggestions s where not exists ( - select 1 from "llmPromptHistory" lsv + select 1 from "llmSuggestionVerdicts" lsv where ( - lsv."entityId" = s."organizationId" and - (lsv.metadata ->> 'secondaryId')::uuid = s."toMergeId" and - lsv.type = '${LlmQueryType.ORGANIZATION_MERGE}' + lsv."primaryId" = s."organizationId" and + lsv."secondaryId" = s."toMergeId" and + lsv.type = '${LLMSuggestionVerdictType.ORGANIZATION}' ) or ( - lsv."entityId" = s."toMergeId" and - (lsv.metadata ->> 'secondaryId')::uuid = s."organizationId" and - lsv.type = '${LlmQueryType.ORGANIZATION_MERGE}' + lsv."primaryId" = s."toMergeId" and + lsv."secondaryId" = s."organizationId" and + lsv.type = '${LLMSuggestionVerdictType.ORGANIZATION}' ) ) @@ -274,17 +274,17 @@ class OrganizationMergeSuggestionsRepository { query = `select * from "organizationToMergeRaw" otmr where not exists ( - select 1 from "llmPromptHistory" lsv + select 1 from "llmSuggestionVerdicts" lsv where ( - lsv."entityId" = otmr."organizationId" and - (lsv.metadata ->> 'secondaryId')::uuid = otmr."toMergeId" and - lsv.type = '${LlmQueryType.ORGANIZATION_MERGE}' + lsv."primaryId" = otmr."organizationId" and + lsv."secondaryId" = otmr."toMergeId" and + lsv.type = '${LLMSuggestionVerdictType.ORGANIZATION}' ) or ( - lsv."entityId" = otmr."toMergeId" and - (lsv.metadata ->> 'secondaryId')::uuid = otmr."organizationId" and - lsv.type = '${LlmQueryType.ORGANIZATION_MERGE}' + lsv."primaryId" = otmr."toMergeId" and + lsv."secondaryId" = otmr."organizationId" and + lsv.type = '${LLMSuggestionVerdictType.ORGANIZATION}' ) ) ${similarityLTEFilter} diff --git a/services/libs/data-access-layer/src/old/apps/premium/members_enrichment_worker/index.ts b/services/libs/data-access-layer/src/old/apps/premium/members_enrichment_worker/index.ts index 37a0a4ef3c..19db4194a4 100644 --- a/services/libs/data-access-layer/src/old/apps/premium/members_enrichment_worker/index.ts +++ b/services/libs/data-access-layer/src/old/apps/premium/members_enrichment_worker/index.ts @@ -34,7 +34,8 @@ export async function fetchMemberDataForLLMSquashing( select m."displayName", m.attributes, m."manuallyChangedFields", - (select json_agg( + coalesce( + (select json_agg( (select row_to_json(r) from (select mi.type, mi.platform, @@ -42,8 +43,10 @@ export async function fetchMemberDataForLLMSquashing( ) from 
"memberIdentities" mi where mi."memberId" = m.id - and verified = true) as identities, - json_agg( + and verified = true), '[]'::json) as identities, + coalesce( + nullif( + json_agg( (select row_to_json(r) from (select mo."orgId", mo."orgName", @@ -51,8 +54,11 @@ export async function fetchMemberDataForLLMSquashing( mo."dateStart", mo."dateEnd", mo.source) r) - ) as organizations - + )::jsonb, + jsonb_build_array(null) + )::json, + '[]'::json + ) as organizations from members m left join member_orgs mo on mo."memberId" = m.id where m.id = $(memberId) diff --git a/services/libs/types/src/enums/llm.ts b/services/libs/types/src/enums/llm.ts index 9649bb1954..910076a1dd 100644 --- a/services/libs/types/src/enums/llm.ts +++ b/services/libs/types/src/enums/llm.ts @@ -4,7 +4,5 @@ export enum LlmModelType { } export enum LlmQueryType { - MEMBER_MERGE = 'member_merge_suggestion', - ORGANIZATION_MERGE = 'organization_merge_suggestion', MEMBER_ENRICHMENT = 'member_enrichment', } diff --git a/services/libs/types/src/enums/merging.ts b/services/libs/types/src/enums/merging.ts index 3eb3a67d88..9ca2460b16 100644 --- a/services/libs/types/src/enums/merging.ts +++ b/services/libs/types/src/enums/merging.ts @@ -28,3 +28,8 @@ export enum MemberRoleUnmergeStrategy { SAME_MEMBER = 'same-member', SAME_ORGANIZATION = 'same-organization', } + +export enum LLMSuggestionVerdictType { + MEMBER = 'member', + ORGANIZATION = 'organization', +} diff --git a/services/libs/types/src/llm.ts b/services/libs/types/src/llm.ts index 8bf991c125..20194bfd14 100644 --- a/services/libs/types/src/llm.ts +++ b/services/libs/types/src/llm.ts @@ -45,29 +45,68 @@ export const LLM_MODEL_PRICING_MAP: Record = { } export const LLM_SETTINGS: Record = { - [LlmQueryType.MEMBER_MERGE]: { - modelId: LlmModelType.CLAUDE_3_OPUS, - arguments: { - max_tokens: 2000, - anthropic_version: 'bedrock-2023-05-31', - temperature: 0, - }, - }, - [LlmQueryType.ORGANIZATION_MERGE]: { - modelId: LlmModelType.CLAUDE_3_OPUS, - arguments: { - max_tokens: 2000, - anthropic_version: 'bedrock-2023-05-31', - temperature: 0, - }, - }, [LlmQueryType.MEMBER_ENRICHMENT]: { modelId: LlmModelType.CLAUDE_3_5_SONNET, arguments: { - // TODO uros check if this is ok max_tokens: 200000, anthropic_version: 'bedrock-2023-05-31', temperature: 0, }, }, } + +export interface LlmIdentity { + t: string // type + v: string // value + p: string // platform + ve: boolean // verification status +} + +export interface LlmOrganizationConnection { + orgId: string + t: string // title + ds: string // dateStart + de: string // dateEnd + s: string // source +} + +export interface LlmNewOrganization { + n: string // name + i: LlmIdentity[] // identities + conn: { + title: string + ds: string // dateStart + de: string // dateEnd + s: string // source + } +} + +export interface LlmAttributeValue { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + default?: any + // eslint-disable-next-line @typescript-eslint/no-explicit-any + [source: string]: any +} + +export interface LlmMemberEnrichmentResult { + confidence: number + changes: { + displayName: string + identities: { + updateExisting: LlmIdentity[] + new: LlmIdentity[] + } + attributes: { + update: { + [attributeName: string]: LlmAttributeValue + } + new: { + [attributeName: string]: LlmAttributeValue + } + } + organizations: { + newConns: LlmOrganizationConnection[] + newOrgs: LlmNewOrganization[] + } + } +} diff --git a/services/libs/types/src/members.ts b/services/libs/types/src/members.ts index 3b787c4cd1..ff58d79494 
100644 --- a/services/libs/types/src/members.ts +++ b/services/libs/types/src/members.ts @@ -144,7 +144,6 @@ export interface IMemberRenderFriendlyRole { } export interface ILLMConsumableMemberDbResult { - id: string displayName: string attributes: IAttributes joinedAt: string @@ -160,7 +159,6 @@ export interface ILLMConsumableMemberDbResult { } export interface ILLMConsumableMember { - id: string displayName: string attributes: IAttributes joinedAt: string diff --git a/services/libs/types/src/merging.ts b/services/libs/types/src/merging.ts index 96a5289f6c..3a974caf75 100644 --- a/services/libs/types/src/merging.ts +++ b/services/libs/types/src/merging.ts @@ -12,6 +12,7 @@ import { IOrganization, ITag, ITask, + LLMSuggestionVerdictType, MergeActionState, MergeActionStep, MergeActionType, @@ -92,3 +93,17 @@ export interface IOrganizationUnmergePreviewResult extends IOrganization { memberCount: number activityCount: number } + +export interface ILLMSuggestionVerdict { + id?: string + type: LLMSuggestionVerdictType + model: string + primaryId: string + secondaryId: string + prompt: string + verdict: string + inputTokenCount: number + outputTokenCount: number + responseTimeSeconds: number + createdAt?: string +} diff --git a/services/libs/types/src/organizations.ts b/services/libs/types/src/organizations.ts index 2462a285e0..4f3b4c66fb 100644 --- a/services/libs/types/src/organizations.ts +++ b/services/libs/types/src/organizations.ts @@ -166,7 +166,6 @@ export interface ILLMConsumableOrganizationDbResult { } export interface ILLMConsumableOrganization { - id: string displayName: string description: string phoneNumbers: string[] From 6304b887eeada767357943ddf2a3f9b4cc0a83dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Thu, 14 Nov 2024 15:33:28 +0100 Subject: [PATCH 4/6] wip --- .../V1731052735__llm-prompt-history.sql | 35 ++- .../src/service/organization.service.ts | 127 +---------- .../src/activities/enrichment.ts | 206 +++++++++++++++--- .../src/bin/onboarding.ts | 94 +++++++- .../members_enrichment_worker/index.ts | 90 ++++---- .../src/organizations/attributesConfig.ts | 1 + .../src/organizations/organizations.ts | 124 ++++++++++- .../libs/types/src/enums/organizations.ts | 2 + services/libs/types/src/llm.ts | 4 +- services/libs/types/src/premium/enrichment.ts | 1 + 10 files changed, 482 insertions(+), 202 deletions(-) diff --git a/backend/src/database/migrations/V1731052735__llm-prompt-history.sql b/backend/src/database/migrations/V1731052735__llm-prompt-history.sql index 41a3a9adba..8b9bcd900b 100644 --- a/backend/src/database/migrations/V1731052735__llm-prompt-history.sql +++ b/backend/src/database/migrations/V1731052735__llm-prompt-history.sql @@ -14,4 +14,37 @@ create table "llmPromptHistory" ( create index "ix_llmPromptHistory_type_entityId" on "llmPromptHistory"("type", "entityId"); create index "ix_llmPromptHistory_entityId" on "llmPromptHistory"("entityId"); -create index "ix_llmPromptHistory_type" on "llmPromptHistory"("type"); \ No newline at end of file +create index "ix_llmPromptHistory_type" on "llmPromptHistory"("type"); + +-- backup members table +create table members_backup_14_11_2024 as +select * +from members + with no data; + +-- Copy all data +insert into members_backup_14_11_2024 +select * +from members; + +-- backup memberIdentities table +create table member_identities_backup_14_11_2024 as +select * +from "memberIdentities" + with no data; + +-- Copy all data +insert into member_identities_backup_14_11_2024 +select * +from "memberIdentities"; 
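+-- editor's note (not part of the original migration): "create table ... as select * ... with no data"
+-- clones only the column definitions; the follow-up "insert ... select" copies the rows, so these
+-- backup snapshots carry no indexes, constraints, or triggers.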
+ +-- backup memberOrganizations table +create table member_organizations_backup_14_11_2024 as +select * +from "memberOrganizations" + with no data; + +-- Copy all data +insert into member_organizations_backup_14_11_2024 +select * +from "memberOrganizations"; \ No newline at end of file diff --git a/services/apps/data_sink_worker/src/service/organization.service.ts b/services/apps/data_sink_worker/src/service/organization.service.ts index 8cfbdcfbc9..0de6fd9114 100644 --- a/services/apps/data_sink_worker/src/service/organization.service.ts +++ b/services/apps/data_sink_worker/src/service/organization.service.ts @@ -1,22 +1,14 @@ -import { websiteNormalizer } from '@crowd/common' import { DbStore } from '@crowd/data-access-layer/src/database' import IntegrationRepository from '@crowd/data-access-layer/src/old/apps/data_sink_worker/repo/integration.repo' import { - addOrgIdentity, addOrgToSyncRemote, addOrgsToMember, addOrgsToSegments, findMemberOrganizations, - findOrgAttributes, + findOrCreateOrganization, findOrgBySourceId, findOrgByVerifiedIdentity, getOrgIdentities, - insertOrganization, - markOrgAttributeDefault, - prepareOrganizationData, - updateOrganization, - upsertOrgAttributes, - upsertOrgIdentities, } from '@crowd/data-access-layer/src/organizations' import { dbStoreQx } from '@crowd/data-access-layer/src/queryExecutor' import { Logger, LoggerBase, getChildLogger } from '@crowd/logging' @@ -24,7 +16,6 @@ import { IMemberOrganization, IOrganization, IOrganizationIdSource, - OrganizationIdentityType, PlatformType, } from '@crowd/types' @@ -42,117 +33,13 @@ export class OrganizationService extends LoggerBase { integrationId: string, data: IOrganization, ): Promise { - const verifiedIdentities = data.identities ? data.identities.filter((i) => i.verified) : [] - if (verifiedIdentities.length === 0) { - const message = `Missing organization identity while creating/updating organization!` - this.log.error(data, message) - throw new Error(message) - } - - try { - const id = await this.store.transactionally(async (txStore) => { - const qe = dbStoreQx(txStore) - - // Normalize the website identities - for (const identity of data.identities.filter((i) => - [ - OrganizationIdentityType.PRIMARY_DOMAIN, - OrganizationIdentityType.ALTERNATIVE_DOMAIN, - ].includes(i.type), - )) { - identity.value = websiteNormalizer(identity.value, false) - } - - let existing - - // find existing org by sent verified identities - for (const identity of verifiedIdentities) { - existing = await findOrgByVerifiedIdentity(qe, tenantId, identity) - if (existing) { - break - } - } - - let id - - if (existing) { - this.log.trace(`Found existing organization, organization will be updated!`) - - const existingAttributes = await findOrgAttributes(qe, existing.id) - - const processed = prepareOrganizationData(data, source, existing, existingAttributes) - - this.log.trace({ updateData: processed.organization }, `Updating organization!`) - - if (Object.keys(processed.organization).length > 0) { - this.log.info({ orgId: existing.id }, `Updating organization!`) - await updateOrganization(qe, existing.id, processed.organization) - } - await upsertOrgIdentities(qe, existing.id, tenantId, data.identities, integrationId) - await upsertOrgAttributes(qe, existing.id, processed.attributes) - for (const attr of processed.attributes) { - if (attr.default) { - await markOrgAttributeDefault(qe, existing.id, attr) - } - } - - id = existing.id - } else { - this.log.trace(`Organization wasn't found via website or identities.`) - const 
firstVerified = verifiedIdentities[0] - - const payload = { - displayName: firstVerified.value, - description: data.description, - logo: data.logo, - tags: data.tags, - employees: data.employees, - location: data.location, - type: data.type, - size: data.size, - headline: data.headline, - industry: data.industry, - founded: data.founded, - } - - const processed = prepareOrganizationData(payload, source) - - this.log.trace({ payload: processed }, `Creating new organization!`) - - // if it doesn't exists create it - id = await insertOrganization(qe, tenantId, processed.organization) - - await upsertOrgAttributes(qe, id, processed.attributes) - for (const attr of processed.attributes) { - if (attr.default) { - await markOrgAttributeDefault(qe, id, attr) - } - } - - // create identities - for (const i of data.identities) { - // add the identity - await addOrgIdentity(qe, { - organizationId: id, - tenantId, - platform: i.platform, - type: i.type, - value: i.value, - verified: i.verified, - sourceId: i.sourceId, - integrationId, - }) - } - } - - return id - }) - + const id = await this.store.transactionally(async (txStore) => { + const qe = dbStoreQx(txStore) + const id = await findOrCreateOrganization(qe, tenantId, source, data, integrationId) return id - } catch (err) { - this.log.error(err, 'Error while upserting an organization!') - throw err - } + }) + + return id } public async addToMember( diff --git a/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts b/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts index 3c3163aacd..ea5adb3a31 100644 --- a/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts +++ b/services/apps/premium/members_enrichment_worker/src/activities/enrichment.ts @@ -1,18 +1,31 @@ import { LlmService } from '@crowd/common_services' +import { updateMemberAttributes } from '@crowd/data-access-layer' import { findMemberIdentityWithTheMostActivityInPlatform as findMemberIdentityWithTheMostActivityInPlatformQuestDb } from '@crowd/data-access-layer/src/activities' +import { + updateVerifiedFlag, + upsertMemberIdentity, +} from '@crowd/data-access-layer/src/member_identities' import { fetchMemberDataForLLMSquashing, findMemberEnrichmentCacheDb, findMemberEnrichmentCacheForAllSourcesDb, insertMemberEnrichmentCacheDb, + insertWorkExperience, touchMemberEnrichmentCacheUpdatedAtDb, + updateLastEnrichedDate, updateMemberEnrichmentCacheDb, } from '@crowd/data-access-layer/src/old/apps/premium/members_enrichment_worker' +import { findOrCreateOrganization } from '@crowd/data-access-layer/src/organizations' +import { dbStoreQx } from '@crowd/data-access-layer/src/queryExecutor' import { RedisCache } from '@crowd/redis' import { IEnrichableMemberIdentityActivityAggregate, IMemberEnrichmentCache, MemberEnrichmentSource, + MemberIdentityType, + OrganizationAttributeSource, + OrganizationIdentityType, + OrganizationSource, } from '@crowd/types' import { EnrichmentSourceServiceFactory } from '../factory' @@ -145,7 +158,10 @@ export async function processMemberSources( ): Promise { svc.log.debug({ memberId }, 'Processing member sources!') + // without contributions since they take a lot of space const toBeSquashed = {} + + // just the contributions if we need them later on const toBeSquashedContributions = {} // find if there's already saved enrichment data in source @@ -158,7 +174,6 @@ export async function processMemberSources( cache.data, )) as IMemberEnrichmentDataNormalized - // TODO uros temp remove contributions 
from sources to mitigate context size if (Array.isArray(normalized)) { const normalizedContributions = [] for (const n of normalized) { @@ -192,8 +207,6 @@ export async function processMemberSources( const existingMemberData = await fetchMemberDataForLLMSquashing(svc.postgres.reader, memberId) svc.log.info({ memberId }, 'Squashing data for member using LLM!') - // TODO uros Implement data squasher using LLM & actual member entity enrichment logic - const llmService = new LlmService( svc.postgres.writer, { @@ -224,9 +237,13 @@ Your task is to return ONLY THE CHANGES needed to update the existing member dat * Different LinkedIn profiles are found for the same person 2. DATA CONSOLIDATION RULES -- For identities: +- For member identities: + * Only include identities with type "username" or "email" * Only include highest confidence identities (verified or multi-source) * Prioritize professional identities (LinkedIn, GitHub) over social ones +- For organization identities: + * Only include identities with types: "email", "affiliated-profile", "primary-domain", "username", "alternative-domain" + * Exclude any organizations without valid identity types - For attributes: * Only include attributes with clear evidence from multiple sources * Prioritize professional attributes (title, location, skills) over others @@ -241,9 +258,9 @@ Format your response as a JSON object matching this structure: "changes": { "displayName": string, "identities": { - "updateExisting": [ // updates to existing identities + "update": [ // updates to existing identities { - "t": string, // for type + "t": string, // type: must be one of ${Object.values(MemberIdentityType).join(', ')} "v": string, // for value "p": string, // for platform "ve": boolean // new verification status @@ -251,7 +268,7 @@ Format your response as a JSON object matching this structure: ], "new": [ // completely new identities { - "t": string, // for type + "t": string, // type: must be one of ${Object.values(MemberIdentityType).join(', ')} "v": string, // for value "p": string, // for platform "ve": boolean // new verification status @@ -275,29 +292,29 @@ Format your response as a JSON object matching this structure: "organizations": { "newConns": [ // new connections to existing organizations { - "orgId": string, // for organizationId - should match one of the UUIDs of orgs from EXISTING_VERIFIED_MEMBER_DATA + "orgId": string, // for organizationId - MUST match an existing organizationId from organizations array in EXISTING_VERIFIED_MEMBER_DATA. 
If organizations array is empty, newConns must be empty "t": string, // for title "ds": string, // for dateStart "de": string, // for dateEnd "s": string // for source } ], - "newOrgs": [ // completely new organizations to create + "newOrgs": [ // completely new organizations to create when no match found in EXISTING_VERIFIED_MEMBER_DATA organizations array { "n": string, // for org name - "i": [ // identities + "i": [ // identities - must only include supported types and also must include at least one verified identity { - "t": string, // for type + "t": string, // type: must be one of ${Object.values(OrganizationIdentityType).join(', ')} "v": string, // for value "p": string, // for platform "ve": boolean // new verification status } ], "conn": { - "title": string, // for title + "t": string, // for title "ds": string, // for dateStart "de": string, // for dateEnd - "s": string // for source + "s": string // for source: must be one of ${Object.values(OrganizationSource).join(', ')} } } ] @@ -305,36 +322,173 @@ Format your response as a JSON object matching this structure: } } -CRITICAL: If you find you cannot fit all high-confidence data in the response: +CRITICAL VALIDATION RULES: +1. Member identities MUST ONLY have type ${Object.values(MemberIdentityType).join(', ')} +2. Organization identities MUST ONLY have types: ${Object.values(OrganizationIdentityType).join(', ')} +3. Organization sources MUST ONLY have sources: ${Object.values(OrganizationSource).join(', ')} +4. Exclude any identities or organizations that don't meet these type restrictions +5. newConns array must ONLY contain connections to organizations that exist in EXISTING_VERIFIED_MEMBER_DATA organizations array +6. If EXISTING_VERIFIED_MEMBER_DATA organizations array is empty, newConns must be empty array +7. Any organization not found in EXISTING_VERIFIED_MEMBER_DATA organizations array must go into newOrgs + +If you find you cannot fit all high-confidence data in the response: 1. First omit lower confidence attributes 2. Then omit unverified identities 3. Then omit older organizations 4. Finally, only return the most essential and recent data points Answer with JSON only and nothing else. Ensure the response is complete and valid JSON. 
- ` +` const data = await llmService.consolidateMemberEnrichmentData(memberId, prompt) if (data.result.confidence >= 0.85) { svc.log.info({ memberId }, 'LLM returned data with high confidence!') - if (data.result.changes.displayName) { - svc.log.info( - { - memberId, - displayName: data.result.changes.displayName, - oldDisplayName: existingMemberData.displayName, - }, - 'Updating display name!', + await svc.postgres.writer.transactionally(async (tx) => { + const qx = dbStoreQx(tx) + const promises = [] + + // process attributes + let update = false + let attributes = existingMemberData.attributes + + if (data.result.changes.attributes) { + if (data.result.changes.attributes.update) { + attributes = { ...attributes, ...data.result.changes.attributes.update } + update = true + } + + if (data.result.changes.attributes.new) { + attributes = { ...attributes, ...data.result.changes.attributes.new } + update = true + } + } + + if (update) { + svc.log.info({ memberId }, 'Updating member attributes!') + promises.push(updateMemberAttributes(qx, memberId, attributes)) + } + + // process identities + if (data.result.changes.identities) { + const identityTypes = Object.values(MemberIdentityType) + + if (data.result.changes.identities.update) { + for (const toUpdate of data.result.changes.identities.update) { + if (identityTypes.includes(toUpdate.t as MemberIdentityType)) { + svc.log.info({ memberId, toUpdate }, 'Updating verified flag for identity!') + promises.push( + updateVerifiedFlag(qx, { + memberId, + tenantId: existingMemberData.tenantId, + platform: toUpdate.p, + type: toUpdate.t as MemberIdentityType, + value: toUpdate.v, + verified: toUpdate.ve, + }), + ) + } else { + svc.log.warn({ memberId, toUpdate }, 'Unknown identity type!') + } + } + } + + if (data.result.changes.identities.new) { + for (const toAdd of data.result.changes.identities.new) { + if (identityTypes.includes(toAdd.t as MemberIdentityType)) { + svc.log.info({ memberId, toAdd }, 'Adding new identity!') + promises.push( + upsertMemberIdentity(qx, { + memberId, + tenantId: existingMemberData.tenantId, + platform: toAdd.p, + type: toAdd.t as MemberIdentityType, + value: toAdd.v, + verified: toAdd.ve, + }), + ) + } else { + svc.log.warn({ memberId, toAdd }, 'Unknown identity type!') + } + } + } + } + + // process organizations + if (data.result.changes.organizations) { + const sources = Object.values(OrganizationSource) + + if (data.result.changes.organizations.newConns) { + for (const conn of data.result.changes.organizations.newConns) { + if (sources.includes(conn.s as OrganizationSource)) { + svc.log.info({ memberId, conn }, 'Adding new connection to existing organization!') + promises.push( + insertWorkExperience( + tx.transaction(), + memberId, + conn.orgId, + conn.t, + conn.ds, + conn.de, + conn.s as OrganizationSource, + ), + ) + } else { + svc.log.warn({ memberId, conn }, 'Unknown organization source!') + } + } + } + + if (data.result.changes.organizations.newOrgs) { + for (const org of data.result.changes.organizations.newOrgs) { + svc.log.info({ memberId, org }, 'Adding new organization!') + promises.push( + findOrCreateOrganization( + qx, + existingMemberData.tenantId, + OrganizationAttributeSource.ENRICHMENT, + { + displayName: org.n, + identities: org.i.map((i) => { + return { + type: i.t as OrganizationIdentityType, + platform: i.p, + value: i.v, + verified: i.ve, + } + }), + }, + ).then((orgId) => + insertWorkExperience( + tx.transaction(), + memberId, + orgId, + org.conn.t, + org.conn.ds, + org.conn.de, + org.conn.s 
as OrganizationSource, + ), + ), + ) + } + } + } + + // also touch members.lastEnriched date + promises.push( + updateLastEnrichedDate(tx.transaction(), memberId, existingMemberData.tenantId), ) - // TODO uros update member data - } + await Promise.all(promises) + }) + + svc.log.debug({ memberId }, 'Member sources processed successfully!') + return true } else { svc.log.warn({ memberId }, 'LLM returned data with low confidence!') } } else { - svc.log.debug({ memberId }, 'No data to squash for member!') + svc.log.warn({ memberId }, 'No data to squash for member!') } return false diff --git a/services/apps/premium/members_enrichment_worker/src/bin/onboarding.ts b/services/apps/premium/members_enrichment_worker/src/bin/onboarding.ts index c78ec9517b..6c66f28de4 100644 --- a/services/apps/premium/members_enrichment_worker/src/bin/onboarding.ts +++ b/services/apps/premium/members_enrichment_worker/src/bin/onboarding.ts @@ -1,3 +1,4 @@ +import { timeout } from '@crowd/common' import { MemberEnrichmentSource } from '@crowd/types' import { processMemberSources } from '../activities/enrichment' @@ -21,6 +22,9 @@ if (processArguments.length !== 1) { const tenantId = processArguments[0] +const minMemberActivities = 100 +const maxConcurrentProcessing = 5 + async function getEnrichableMembers(limit: number, lastMemberId?: string): Promise { const query = ` -- only use members that have more than one enrichment source @@ -35,7 +39,7 @@ async function getEnrichableMembers(limit: number, lastMemberId?: string): Promi -- only consider subprojects otherwise we count some activities multiple times inner join segments s on s.id = msa."segmentId" and s."tenantId" = $(tenantId) and s.type = 'subproject' group by msa."memberId" - having sum("activityCount") > 100) + having sum("activityCount") > $(minMemberActivities)) select m.id from members m inner join members_with_activities ma on m.id = ma."memberId" @@ -47,9 +51,11 @@ async function getEnrichableMembers(limit: number, lastMemberId?: string): Promi limit $(limit) ` - return (await svc.postgres.writer.connection().any(query, { lastMemberId, limit, tenantId })).map( - (row) => row.id, - ) + return ( + await svc.postgres.writer + .connection() + .any(query, { lastMemberId, limit, tenantId, minMemberActivities }) + ).map((row) => row.id) } const sources = Object.values(MemberEnrichmentSource) as MemberEnrichmentSource[] @@ -57,20 +63,92 @@ const sources = Object.values(MemberEnrichmentSource) as MemberEnrichmentSource[ setImmediate(async () => { await svc.init(false) + let processingCount = 0 + let updatedMembersCount = 0 + let skippedMembersCount = 0 + let failedMembersCount = 0 + + let totalProcessingTime = 0 + let processedMembersCount = 0 + const REPORT_INTERVAL = 10 + const pageSize = 100 let members = await getEnrichableMembers(pageSize) - // let members = ['8db6c61e-f8b0-400e-ac5d-cb550d8740c9'] + let pagePromises: Promise[] = [] while (members.length > 0) { svc.log.info({ memberCount: members.length }, 'Processing members!') // process members just like in enrichMember workflow for (const memberId of members) { - await processMemberSources(memberId, sources) + while (processingCount >= maxConcurrentProcessing) { + await timeout(100) + } + + processingCount++ + const startTime = Date.now() + + const promise = processMemberSources(memberId, sources) + .then((res) => { + processingCount-- + if (res) { + const processingTime = Date.now() - startTime + totalProcessingTime += processingTime + processedMembersCount++ + + updatedMembersCount++ + } else { + 
skippedMembersCount++ + } + + // Report average processing time every REPORT_INTERVAL members + if (processedMembersCount > 0 && processedMembersCount % REPORT_INTERVAL === 0) { + const averageProcessingTime = totalProcessingTime / processedMembersCount + svc.log.info( + { + averageProcessingTime: `${(averageProcessingTime / 1000).toFixed(2)}s`, + processedMembers: processedMembersCount, + updatedMembers: updatedMembersCount, + skippedMembers: skippedMembersCount, + failedMembers: failedMembersCount, + }, + 'Processing time statistics', + ) + } + }) + .catch((err) => { + processingCount-- + svc.log.error(err, { memberId }, 'Error while processing member enrichment sources!') + failedMembersCount++ + }) + pagePromises.push(promise) } + await Promise.all(pagePromises) + pagePromises = [] // load next page - // members = await getEnrichableMembers(pageSize, members[members.length - 1]) - members = [] + members = await getEnrichableMembers(pageSize, members[members.length - 1]) + + svc.log.info( + { + updatedMembersCount, + skippedMembersCount, + failedMembersCount, + averageProcessingTime: `${(totalProcessingTime / processedMembersCount / 1000).toFixed(2)}s`, + }, + 'Current statistics!', + ) } + svc.log.info( + { + updatedMembersCount, + skippedMembersCount, + failedMembersCount, + averageProcessingTime: `${(totalProcessingTime / processedMembersCount / 1000).toFixed(2)}s`, + totalProcessedMembers: processedMembersCount, + totalProcessingTime: `${(totalProcessingTime / 1000).toFixed(2)}s`, + }, + 'Final statistics!', + ) + process.exit(0) }) diff --git a/services/libs/data-access-layer/src/old/apps/premium/members_enrichment_worker/index.ts b/services/libs/data-access-layer/src/old/apps/premium/members_enrichment_worker/index.ts index 19db4194a4..b42f1997fb 100644 --- a/services/libs/data-access-layer/src/old/apps/premium/members_enrichment_worker/index.ts +++ b/services/libs/data-access-layer/src/old/apps/premium/members_enrichment_worker/index.ts @@ -20,50 +20,52 @@ export async function fetchMemberDataForLLMSquashing( const result = await db.connection().oneOrNone( ` with member_orgs as (select distinct mo."memberId", - mo."organizationId" as "orgId", - o."displayName" as "orgName", - mo.title as "jobTitle", - mo."dateStart", - mo."dateEnd", - mo.source - from "memberOrganizations" mo - inner join organizations o on mo."organizationId" = o.id - where mo."memberId" = $(memberId) - and mo."deletedAt" is null - and o."deletedAt" is null) - select m."displayName", - m.attributes, - m."manuallyChangedFields", - coalesce( - (select json_agg( - (select row_to_json(r) - from (select mi.type, - mi.platform, - mi.value) r) - ) - from "memberIdentities" mi - where mi."memberId" = m.id - and verified = true), '[]'::json) as identities, - coalesce( - nullif( - json_agg( - (select row_to_json(r) - from (select mo."orgId", - mo."orgName", - mo."jobTitle", - mo."dateStart", - mo."dateEnd", - mo.source) r) - )::jsonb, - jsonb_build_array(null) - )::json, - '[]'::json - ) as organizations - from members m - left join member_orgs mo on mo."memberId" = m.id - where m.id = $(memberId) - and m."deletedAt" is null - group by m.id, m."displayName", m.attributes, m."manuallyChangedFields"; + mo."organizationId" as "orgId", + o."displayName" as "orgName", + mo.title as "jobTitle", + mo."dateStart", + mo."dateEnd", + mo.source + from "memberOrganizations" mo + inner join organizations o on mo."organizationId" = o.id + where mo."memberId" = $(memberId) + and mo."deletedAt" is null + and o."deletedAt" is null) + 
select m."displayName", + m.attributes, + m."manuallyChangedFields", + m."tenantId", + coalesce( + (select json_agg( + (select row_to_json(r) + from (select mi.type, + mi.platform, + mi.value) r) + ) + from "memberIdentities" mi + where mi."memberId" = m.id + and verified = true), '[]'::json) as identities, + case + when exists (select 1 from member_orgs where "memberId" = m.id) + then ( + select json_agg( + (select row_to_json(r) + from (select mo."orgId", + mo."orgName", + mo."jobTitle", + mo."dateStart", + mo."dateEnd", + mo.source) r) + ) + from member_orgs mo + where mo."memberId" = m.id + ) + else '[]'::json + end as organizations + from members m + where m.id = $(memberId) + and m."deletedAt" is null + group by m.id, m."displayName", m.attributes, m."manuallyChangedFields"; `, { memberId, diff --git a/services/libs/data-access-layer/src/organizations/attributesConfig.ts b/services/libs/data-access-layer/src/organizations/attributesConfig.ts index 5775c76e2f..27a48d6b52 100644 --- a/services/libs/data-access-layer/src/organizations/attributesConfig.ts +++ b/services/libs/data-access-layer/src/organizations/attributesConfig.ts @@ -231,4 +231,5 @@ export const ORG_DB_ATTRIBUTE_SOURCE_PRIORITY = [ OrganizationAttributeSource.PDL, OrganizationAttributeSource.EMAIL, OrganizationAttributeSource.GITHUB, + OrganizationAttributeSource.ENRICHMENT, ] diff --git a/services/libs/data-access-layer/src/organizations/organizations.ts b/services/libs/data-access-layer/src/organizations/organizations.ts index 632adaa0d7..f6993f5984 100644 --- a/services/libs/data-access-layer/src/organizations/organizations.ts +++ b/services/libs/data-access-layer/src/organizations/organizations.ts @@ -1,21 +1,29 @@ -import { generateUUIDv1 } from '@crowd/common' +import { generateUUIDv1, websiteNormalizer } from '@crowd/common' +import { getServiceChildLogger } from '@crowd/logging' import { IMemberOrganization, + IOrganization, IOrganizationIdSource, IQueryTimeseriesParams, ITimeseriesDatapoint, + OrganizationIdentityType, SyncStatus, } from '@crowd/types' import { QueryExecutor } from '../queryExecutor' import { prepareSelectColumns } from '../utils' +import { findOrgAttributes, markOrgAttributeDefault, upsertOrgAttributes } from './attributes' +import { addOrgIdentity, upsertOrgIdentities } from './identities' import { IDbOrgIdentity, IDbOrganization, IDbOrganizationInput, IEnrichableOrganizationData, } from './types' +import { prepareOrganizationData } from './utils' + +const log = getServiceChildLogger('data-access-layer/organizations') const ORG_SELECT_COLUMNS = [ 'id', @@ -440,3 +448,117 @@ export async function getTimeseriesOfActiveOrganizations( return qx.select(query, params) } + +export async function findOrCreateOrganization( + qe: QueryExecutor, + tenantId: string, + source: string, + data: IOrganization, + integrationId?: string, +): Promise { + const verifiedIdentities = data.identities ? 
data.identities.filter((i) => i.verified) : [] + if (verifiedIdentities.length === 0) { + const message = `Missing organization identity while creating/updating organization!` + log.error(data, message) + throw new Error(message) + } + + try { + // Normalize the website identities + for (const identity of data.identities.filter((i) => + [ + OrganizationIdentityType.PRIMARY_DOMAIN, + OrganizationIdentityType.ALTERNATIVE_DOMAIN, + ].includes(i.type), + )) { + identity.value = websiteNormalizer(identity.value, false) + } + + let existing + + // find existing org by sent verified identities + for (const identity of verifiedIdentities) { + existing = await findOrgByVerifiedIdentity(qe, tenantId, identity) + if (existing) { + break + } + } + + let id + + if (existing) { + log.trace(`Found existing organization, organization will be updated!`) + + const existingAttributes = await findOrgAttributes(qe, existing.id) + + const processed = prepareOrganizationData(data, source, existing, existingAttributes) + + log.trace({ updateData: processed.organization }, `Updating organization!`) + + if (Object.keys(processed.organization).length > 0) { + log.info({ orgId: existing.id }, `Updating organization!`) + await updateOrganization(qe, existing.id, processed.organization) + } + await upsertOrgIdentities(qe, existing.id, tenantId, data.identities, integrationId) + await upsertOrgAttributes(qe, existing.id, processed.attributes) + for (const attr of processed.attributes) { + if (attr.default) { + await markOrgAttributeDefault(qe, existing.id, attr) + } + } + + id = existing.id + } else { + log.trace(`Organization wasn't found via website or identities.`) + const firstVerified = verifiedIdentities[0] + + const payload = { + displayName: firstVerified.value, + description: data.description, + logo: data.logo, + tags: data.tags, + employees: data.employees, + location: data.location, + type: data.type, + size: data.size, + headline: data.headline, + industry: data.industry, + founded: data.founded, + } + + const processed = prepareOrganizationData(payload, source) + + log.trace({ payload: processed }, `Creating new organization!`) + + // if it doesn't exists create it + id = await insertOrganization(qe, tenantId, processed.organization) + + await upsertOrgAttributes(qe, id, processed.attributes) + for (const attr of processed.attributes) { + if (attr.default) { + await markOrgAttributeDefault(qe, id, attr) + } + } + + // create identities + for (const i of data.identities) { + // add the identity + await addOrgIdentity(qe, { + organizationId: id, + tenantId, + platform: i.platform, + type: i.type, + value: i.value, + verified: i.verified, + sourceId: i.sourceId, + integrationId, + }) + } + } + + return id + } catch (err) { + log.error(err, 'Error while upserting an organization!') + throw err + } +} diff --git a/services/libs/types/src/enums/organizations.ts b/services/libs/types/src/enums/organizations.ts index 34dfd708cd..b7fbed720d 100644 --- a/services/libs/types/src/enums/organizations.ts +++ b/services/libs/types/src/enums/organizations.ts @@ -32,6 +32,7 @@ export enum OrganizationIdentityType { PRIMARY_DOMAIN = 'primary-domain', ALTERNATIVE_DOMAIN = 'alternative-domain', AFFILIATED_PROFILE = 'affiliated-profile', + EMAIL = 'email', } export enum OrganizationAttributeSource { @@ -39,4 +40,5 @@ export enum OrganizationAttributeSource { PDL = 'peopledatalabs', EMAIL = 'email', GITHUB = 'github', + ENRICHMENT = 'enrichment', } diff --git a/services/libs/types/src/llm.ts b/services/libs/types/src/llm.ts 
index 20194bfd14..7dbfade447 100644 --- a/services/libs/types/src/llm.ts +++ b/services/libs/types/src/llm.ts @@ -74,7 +74,7 @@ export interface LlmNewOrganization { n: string // name i: LlmIdentity[] // identities conn: { - title: string + t: string // title ds: string // dateStart de: string // dateEnd s: string // source @@ -93,7 +93,7 @@ export interface LlmMemberEnrichmentResult { changes: { displayName: string identities: { - updateExisting: LlmIdentity[] + update: LlmIdentity[] new: LlmIdentity[] } attributes: { diff --git a/services/libs/types/src/premium/enrichment.ts b/services/libs/types/src/premium/enrichment.ts index 08ffc350fd..a356a6e285 100644 --- a/services/libs/types/src/premium/enrichment.ts +++ b/services/libs/types/src/premium/enrichment.ts @@ -45,6 +45,7 @@ export interface IMemberOriginalData { displayName: string attributes: Record> manuallyChangedFields: string[] + tenantId: string // memberIdentities table data identities: IMemberIdentity[] From 1aba43940291bf0702e50d2cbe3b6dfab0f40d8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Thu, 14 Nov 2024 15:34:26 +0100 Subject: [PATCH 5/6] fix --- pnpm-lock.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 872d4a8f68..cca8ed58c5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1058,9 +1058,6 @@ importers: '@crowd/common': specifier: workspace:* version: link:../../libs/common - '@crowd/common_services': - specifier: workspace:* - version: link:../../libs/common_services '@crowd/data-access-layer': specifier: workspace:* version: link:../../libs/data-access-layer From d47bed70296e4d42dda6ea80f60f5a5c68aa0f92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Thu, 14 Nov 2024 17:13:03 +0100 Subject: [PATCH 6/6] small changes --- services/apps/premium/members_enrichment_worker/package.json | 1 + .../premium/members_enrichment_worker/src/bin/onboarding.ts | 4 +++- .../members_enrichment_worker/src/workflows/enrichMember.ts | 4 ++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/services/apps/premium/members_enrichment_worker/package.json b/services/apps/premium/members_enrichment_worker/package.json index 2bd7c8c344..9555616b72 100644 --- a/services/apps/premium/members_enrichment_worker/package.json +++ b/services/apps/premium/members_enrichment_worker/package.json @@ -6,6 +6,7 @@ "start:debug": "CROWD_TEMPORAL_TASKQUEUE=members-enrichment SERVICE=members-enrichment-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", "dev:local": "nodemon --watch src --watch ../../../libs --ext ts --exec pnpm run start:debug:local", "dev": "nodemon --watch src --watch ../../../libs --ext ts --exec pnpm run start:debug", + "script:onboarding:local": "set -a && . ../../../../backend/.env.dist.local && . 
../../../../backend/.env.override.local && set +a && SERVICE=script tsx --inspect src/bin/onboarding.ts", "script:onboarding": "SERVICE=script tsx --inspect src/bin/onboarding.ts", "lint": "npx eslint --ext .ts src --max-warnings=0", "format": "npx prettier --write \"src/**/*.ts\"", diff --git a/services/apps/premium/members_enrichment_worker/src/bin/onboarding.ts b/services/apps/premium/members_enrichment_worker/src/bin/onboarding.ts index 6c66f28de4..9a972522a5 100644 --- a/services/apps/premium/members_enrichment_worker/src/bin/onboarding.ts +++ b/services/apps/premium/members_enrichment_worker/src/bin/onboarding.ts @@ -20,6 +20,7 @@ if (processArguments.length !== 1) { process.exit(1) } +// TODO maybe add segmentId as parameter here as well const tenantId = processArguments[0] const minMemberActivities = 100 @@ -30,9 +31,10 @@ async function getEnrichableMembers(limit: number, lastMemberId?: string): Promi -- only use members that have more than one enrichment source with members_with_sources as (select distinct "memberId", count(*) from "memberEnrichmentCache" + where data is not null group by "memberId" having count(*) > 1), - -- also only use members that have more than 10 activities + -- also only use members that have more than 100 activities members_with_activities as (select distinct msa."memberId", sum("activityCount") as total_activities from members_with_sources ms inner join "memberSegmentsAgg" msa on msa."memberId" = ms."memberId" diff --git a/services/apps/premium/members_enrichment_worker/src/workflows/enrichMember.ts b/services/apps/premium/members_enrichment_worker/src/workflows/enrichMember.ts index 0f35355edd..9cc8423527 100644 --- a/services/apps/premium/members_enrichment_worker/src/workflows/enrichMember.ts +++ b/services/apps/premium/members_enrichment_worker/src/workflows/enrichMember.ts @@ -19,7 +19,6 @@ const { updateMemberEnrichmentCache, isCacheObsolete, findMemberIdentityWithTheMostActivityInPlatform, - processMemberSources, } = proxyActivities({ startToCloseTimeout: '20 seconds', retry: { @@ -101,6 +100,7 @@ export async function enrichMember( if (changeInEnrichmentSourceData) { // Member enrichment data has been updated, use squasher again! - await processMemberSources(input.id, sources) + // TODO member enrichment: enable once we are sure it's working + // await processMemberSources(input.id, sources) } }
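
Editor's note — a minimal, self-contained TypeScript sketch (not part of the patch series) of the throttling pattern that src/bin/onboarding.ts introduces above: an in-flight counter capped at maxConcurrentProcessing, polled with a short sleep before starting new work, and a per-page Promise.all before the next keyset page is fetched. The names processWithThrottle and work are hypothetical stand-ins; the script itself uses timeout from @crowd/common and calls processMemberSources.

import { setTimeout as sleep } from 'node:timers/promises'

const MAX_CONCURRENT = 5 // mirrors maxConcurrentProcessing in the onboarding script

let inFlight = 0

async function processWithThrottle(
  ids: string[],
  work: (id: string) => Promise<boolean>, // hypothetical stand-in for processMemberSources
): Promise<void> {
  const promises: Promise<void>[] = []

  for (const id of ids) {
    // block until a slot frees up, the same way the script polls with timeout(100)
    while (inFlight >= MAX_CONCURRENT) {
      await sleep(100)
    }

    inFlight++
    promises.push(
      work(id)
        .then(() => {
          inFlight--
        })
        .catch((err) => {
          inFlight--
          console.error('processing failed for', id, err)
        }),
    )
  }

  // the script awaits the whole page before loading the next keyset page (m.id > lastMemberId)
  await Promise.all(promises)
}

// usage sketch, reusing names from the patch:
// await processWithThrottle(await getEnrichableMembers(100), (id) => processMemberSources(id, sources))

One consequence of awaiting each page as a whole is that concurrency tapers off at the tail of every page; a semaphore or worker pool would keep all slots busy across page boundaries, at the cost of interleaving keyset pagination with in-flight work.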