Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import/idcc #105

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
"nodemailer": "^6.9.4",
"nodemailer-html-to-text": "^3.2.0",
"pino-pretty": "^10.2.3",
"puppeteer": "^22.0.0",
"rate-limiter-flexible": "^2.4.2",
"shared": "workspace:*",
"unzipper": "^0.10.14",
Expand Down
36 changes: 36 additions & 0 deletions server/src/common/apis/legiFrance/legifrance.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import getApiClient from "../client";

// Root URL of the public Legifrance website targeted by every request in this module.
const legifranceBaseURL = "https://www.legifrance.gouv.fr";

// Shared API client for Legifrance, created with the `cache: false` option.
const legifranceClient = getApiClient({ baseURL: legifranceBaseURL }, { cache: false });

/**
 * Fetches the Legifrance page of a convention collective.
 *
 * @param Kali - Identifier appended to `/conv_coll/id/` (e.g. a `KALICONT…` id).
 * @returns The response payload, or `undefined` when the request fails
 *          (the error is only printed to the console, not rethrown).
 */
export const fetchConventionCollective = async (Kali: string) => {
  const path = `/conv_coll/id/${Kali}`;

  try {
    const response = await legifranceClient.get(path);
    return response.data;
  } catch (error) {
    console.log(error);
  }
};

/**
 * Queries the Legifrance `/listeIdcc/ajax` endpoint for one page of the
 * convention collective listing, filtered by legal state.
 *
 * @param etat_juridique - Value sent as the `facetteEtat` filter (e.g. `"VIGUEUR_ETEN"`).
 * @param page - Page number sent as the `page` parameter; the page size is fixed at 500.
 * @returns The response payload, or `undefined` when the request fails
 *          (the error is logged, not rethrown) — callers must handle `undefined`.
 */
export const fetchAjaxSearchConventionCollective = async ({
  etat_juridique,
  page,
}: {
  etat_juridique: string;
  page: number;
}) => {
  // URLSearchParams percent-encodes every value, so an unexpected character in
  // a parameter cannot corrupt the query string. Insertion order is preserved.
  const query = new URLSearchParams({
    facetteTexteBase: "TEXTE_BASE",
    facetteEtat: etat_juridique,
    sortValue: "DATE_UPDATE",
    pageSize: "500",
    page: String(page),
    tab_selection: "all",
  });

  try {
    const { data } = await legifranceClient.get(`/listeIdcc/ajax?${query.toString()}`);

    return data;
  } catch (error) {
    console.error(`Legifrance search failed (etat_juridique=${etat_juridique}, page=${page})`, error);
    return undefined;
  }
};
65 changes: 65 additions & 0 deletions server/src/modules/jobs/importer/legiFrance/conventionToPdf.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import { ElementHandle, Page } from "puppeteer";

import parentLogger from "@/common/logger";

import { getStaticFilePath } from "../../../../common/utils/getStaticFilePath";
import { gotoUrl, initPuppeteer } from "./puppeteer";

// Logger for this file, derived from the shared parent logger with `module: "convention"` bindings.
const logger = parentLogger.child({ module: "convention" });

/**
 * Reads the target and label of an anchor element living in the browser page.
 *
 * @param element - Handle on the `<a>` element to inspect.
 * @returns The anchor's `href` property value and its `textContent`.
 */
async function getLinkElementDetails(element: ElementHandle<HTMLAnchorElement>) {
  const hrefHandle = await element.getProperty("href");
  const href = await hrefHandle.jsonValue();
  const text = await element.evaluate((el: any) => el.textContent);

  return { href, text };
}

/**
 * Navigates to a link and saves the rendered page as an A4 PDF.
 *
 * The file is written to `./<code>/<sanitised text>.pdf`, resolved through
 * `getStaticFilePath`.
 * NOTE(review): the `<code>` directory is presumably expected to exist already — confirm.
 *
 * @param page - Puppeteer page used for navigation and printing.
 * @param href - URL of the document to print.
 * @param text - Link label, used to build the PDF file name.
 * @param code - Name of the sub-directory receiving the PDF.
 */
async function convertToPdf({ page, href, text, code }: { page: Page; href: string; text: string; code: string }) {
  // The label is scraped from a web page: surrounding whitespace is dropped,
  // inner whitespace becomes "_", and path separators or characters that are
  // reserved in file names become "-", so the label can never alter the
  // directory the file is written to.
  const fileName = text
    .trim()
    .replace(/\s/g, "_")
    .replace(/[\\/:*?"<>|]/g, "-");

  await gotoUrl({ page, url: href });
  await page.pdf({
    path: getStaticFilePath(`./${code}/${fileName}.pdf`),
    // margin: { top: "100px", right: "50px", bottom: "100px", left: "50px" },
    printBackground: true,
    format: "A4",
  });
}

/**
 * Exports a convention collective from Legifrance to PDF files: the base
 * text, then every visible entry of the second and third sections of the
 * `#liste-sommaire` list (named "attached texts" and "salary texts" here).
 *
 * @param kaliId - Legifrance identifier of the convention page to open.
 *                 Defaults to the convention that was previously hard-coded.
 * @param code - Sub-directory name handed to `convertToPdf` for the generated
 *               files. Defaults to the previously hard-coded `"1671"`.
 * @throws Error when the summary list or the base-text link is missing.
 */
export async function runConventionToPDF({
  kaliId = "KALICONT000005635917",
  code = "1671",
}: { kaliId?: string; code?: string } = {}) {
  logger.info("Convention to pdf...");

  const { browser, page } = await initPuppeteer();

  try {
    await gotoUrl({ page, url: `https://www.legifrance.gouv.fr/conv_coll/id/${kaliId}` });

    const listeSommaire = await page.$("#liste-sommaire");
    if (!listeSommaire) {
      throw new Error(`Summary list "#liste-sommaire" not found for convention ${kaliId}`);
    }
    const lis = await listeSommaire.$$(":scope > li");

    // Collects the link of every sub-item of a section that is not hidden with
    // an inline `display: none`. A missing section yields no links.
    const collectVisibleLinks = async (section: (typeof lis)[number] | undefined) => {
      const links: Awaited<ReturnType<typeof getLinkElementDetails>>[] = [];
      if (!section) {
        return links;
      }
      for (const item of await section.$$(":scope > ul > li")) {
        const hidden = await item.evaluate((el: Element) => (el as HTMLElement).style.display === "none");
        if (hidden) {
          continue;
        }
        const anchor = await item.$(":scope > a");
        // An item without a direct link has nothing to export.
        if (anchor) {
          links.push(await getLinkElementDetails(anchor));
        }
      }
      return links;
    };

    const baseLinkElement = lis[0] ? await lis[0].$(":scope > a") : null;
    if (!baseLinkElement) {
      throw new Error(`Base text link not found for convention ${kaliId}`);
    }

    // Every link is read BEFORE the first PDF conversion: converting navigates
    // the same page away from the summary, after which these element handles
    // can no longer be used.
    const textDeBase = await getLinkElementDetails(baseLinkElement);
    const textAttaches = await collectVisibleLinks(lis[1]);
    const textSalaires = await collectVisibleLinks(lis[2]);

    for (const link of [textDeBase, ...textAttaches, ...textSalaires]) {
      await convertToPdf({ page, code, ...link });
    }
  } finally {
    // Always release the browser process, including when a step above throws.
    await browser.close();
  }
}
29 changes: 29 additions & 0 deletions server/src/modules/jobs/importer/legiFrance/puppeteer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import puppeteer, { Page } from "puppeteer";

import { sleep } from "../../../../common/utils/asyncUtils";

/**
 * Launches a browser with Puppeteer's default options and opens a first page.
 *
 * @returns The launched browser and the opened page. The caller owns the
 *          browser and is responsible for closing it.
 * @throws Whatever `puppeteer.launch()` or `browser.newPage()` rejects with.
 */
export const initPuppeteer = async () => {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    return { browser, page };
  } catch (error) {
    // The caller never receives the browser in this case, so close it here to
    // avoid leaking the process. A failure to close must not hide the cause.
    await browser.close().catch(() => undefined);
    throw error;
  }
};

/**
 * Resolves once `eventName` has been observed inside the page.
 *
 * The listener is registered on both `document` and `window`: events such as
 * `load` are dispatched on `window` and never reach a `document` listener.
 * For `load` and `DOMContentLoaded` the promise resolves immediately when the
 * document state shows the event has already fired, since a listener attached
 * afterwards would never be called.
 *
 * @param page - Puppeteer page in which to wait.
 * @param eventName - DOM event type to wait for.
 */
async function waitForEvent(page: Page, eventName: string) {
  return page.evaluate((event: string) => {
    return new Promise<void>((resolve) => {
      const alreadyFired =
        (event === "load" && document.readyState === "complete") ||
        (event === "DOMContentLoaded" && document.readyState !== "loading");
      if (alreadyFired) {
        resolve();
        return;
      }

      // Resolve without a value: the Event object is of no use to the Node side.
      const done = () => resolve();
      document.addEventListener(event, done, { once: true });
      window.addEventListener(event, done, { once: true });
    });
  }, eventName);
}

/**
 * Waits for the page's `load` event, racing it against `sleep(8000)` so the
 * wait ends with whichever of the two settles first.
 *
 * @param page - Puppeteer page to wait on.
 */
export const waitPageLoad = async (page: Page) => {
  const loadEvent = waitForEvent(page, "load");
  const timeLimit = sleep(8000);

  await Promise.race([loadEvent, timeLimit]);
};

/**
 * Navigates `page` to `url`: first until `domcontentloaded`, then through
 * `waitPageLoad` for the remainder of the loading.
 *
 * @param target.page - Puppeteer page to navigate.
 * @param target.url - Destination URL.
 */
export const gotoUrl = async (target: { page: Page; url: string }): Promise<void> => {
  const { page, url } = target;
  const navigationOptions = { waitUntil: "domcontentloaded" } as const;

  await page.goto(url, navigationOptions);
  await waitPageLoad(page);
};
134 changes: 134 additions & 0 deletions server/src/modules/jobs/importer/legiFrance/scrapLegiFrance.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import { unlink, writeFile } from "node:fs/promises";

import { Page } from "puppeteer";

import parentLogger from "@/common/logger";

import { fetchAjaxSearchConventionCollective } from "../../../../common/apis/legiFrance/legifrance";
import { getStaticFilePath } from "../../../../common/utils/getStaticFilePath";
import { gotoUrl, initPuppeteer } from "./puppeteer";

// Logger for this file, derived from the shared parent logger with `module: "import:legifrance"` bindings.
const logger = parentLogger.child({ module: "import:legifrance" });

// Details already scraped, keyed by convention code, so each code's page is visited at most once.
const cacheDetailsCC = new Map();

/**
 * Returns the details of a convention collective, scraping its page on the
 * first request for a given code and serving the cached value afterwards.
 *
 * @param page - Puppeteer page used to open the convention URL.
 * @param url - URL of the convention page.
 * @param code - Convention code, used as the cache key.
 * @returns An object whose `soustitre` is the text content of the second item
 *          of `.picto-list`, or `""` when that item does not exist.
 */
async function getDetailsCC({ page, url, code }: { page: Page; url: string; code: string }) {
  const cached = cacheDetailsCC.get(code);
  if (cached) {
    return cached;
  }

  await gotoUrl({ page, url });

  const secondListItem = await page.$(".picto-list > li:nth-child(2)");
  const soustitre = secondListItem ? await secondListItem.evaluate((el: any) => el.textContent) : "";

  const details = { soustitre };
  cacheDetailsCC.set(code, details);
  return details;
}

/**
 * Scrapes the Legifrance convention collective listing for every legal state
 * and writes the outcome to `results.json` (path resolved by `getStaticFilePath`).
 *
 * For each listing page, the HTML returned by the search endpoint is saved to
 * a temporary `page.html`, opened in the browser and parsed. Each entry found
 * is completed with the subtitle scraped from its own page (`getDetailsCC`).
 * The output maps each code to the list of entries found for it, one per
 * legal state in which the code appears.
 *
 * The browser is always closed and the temporary file removed, including when
 * a step fails.
 *
 * @throws Error when the search returns no data, or when a listing entry has
 *         no link or a link of unexpected format.
 */
export async function runLegiFranceImporter() {
  logger.info("Getting legifrance ...");

  type ConventionEntry = {
    code: string;
    etat_juridique: string;
    url: string;
    titre: string | null;
    soustitre: string | null;
  };

  // The listing is opened from a local file, so its site-relative links resolve
  // against `file://`. The site path is what sits between the scheme and "?origin".
  const hrefPattern = /file:\/\/(.*)\?origin/;
  const pageFilePath = getStaticFilePath("./page.html");

  // NOTE(review): page counts are hard-coded (the endpoint is queried with a
  // page size of 500); entries beyond these pages are not fetched.
  const categories = [
    { etat_juridique: "VIGUEUR_ETEN", pages: 2 },
    { etat_juridique: "VIGUEUR_NON_ETEN", pages: 1 },
    { etat_juridique: "ABROGE", pages: 1 },
    { etat_juridique: "MODIFIE", pages: 1 },
    { etat_juridique: "PERIME", pages: 1 },
    { etat_juridique: "DENONCE", pages: 1 },
    { etat_juridique: "REMPLACE", pages: 1 },
  ];

  const { browser, page } = await initPuppeteer();
  let pageFileWritten = false;

  try {
    // Second tab dedicated to convention pages, so the listing page stays open.
    const pageCC = await browser.newPage();
    const results = new Map<string, ConventionEntry[]>();

    for (const categorie of categories) {
      logger.info(`Legifrance listing: ${categorie.etat_juridique} (${categorie.pages} page(s))`);
      // Keyed by code: within one legal state, a later entry replaces an earlier one.
      const categorieResults = new Map<string, ConventionEntry>();

      for (let pagecount = 1; pagecount <= categorie.pages; pagecount++) {
        const response = await fetchAjaxSearchConventionCollective({
          etat_juridique: categorie.etat_juridique,
          page: pagecount,
        });
        // The fetch helper logs its own failures and returns undefined.
        if (!response) {
          throw new Error(
            `Legifrance search returned no data (etat_juridique=${categorie.etat_juridique}, page=${pagecount})`
          );
        }
        const { content, nbResult } = response;
        logger.info(`Legifrance listing: ${categorie.etat_juridique} page ${pagecount}, nbResult=${nbResult}`);

        await writeFile(pageFilePath, content);
        pageFileWritten = true;
        await gotoUrl({ page, url: `file://${pageFilePath}` });

        const elements = await page.$$(".h4.code-title-convention");

        for (const elementHandle of elements) {
          // One round trip per entry: read the code label and, from the
          // element's grandparent, the title link.
          const listing = await elementHandle.evaluate((el: Element) => {
            const link = el.parentElement?.parentElement?.querySelector<HTMLAnchorElement>(
              ":scope > h2 > div:nth-child(2) > a"
            );
            return {
              rawCode: el.textContent ?? "",
              link: link ? { titre: link.textContent, href: link.href } : null,
            };
          });

          const code = listing.rawCode.trim().replace("IDCC", "").trim().padStart(4, "0");

          if (!listing.link) {
            throw new Error(`Legifrance listing: no link found for IDCC ${code}`);
          }
          const match = hrefPattern.exec(listing.link.href);
          if (!match) {
            throw new Error(`Legifrance listing: unexpected link "${listing.link.href}" for IDCC ${code}`);
          }
          const url = `https://www.legifrance.gouv.fr${match[1]}`;

          const details = await getDetailsCC({ page: pageCC, url, code });

          categorieResults.set(code, {
            code,
            etat_juridique: categorie.etat_juridique,
            url,
            titre: listing.link.titre,
            soustitre: details.soustitre,
          });
        }
      }

      // Merge this legal state's entries into the per-code lists.
      for (const entry of categorieResults.values()) {
        const sameCode = results.get(entry.code);
        if (sameCode) {
          sameCode.push(entry);
        } else {
          results.set(entry.code, [entry]);
        }
      }
    }

    logger.info(`Legifrance listing: ${results.size} distinct codes collected`);
    await writeFile(getStaticFilePath("./results.json"), JSON.stringify(Object.fromEntries(results), null, 2));
  } finally {
    try {
      // Only remove the temporary listing if it was actually created.
      if (pageFileWritten) {
        await unlink(pageFilePath);
      }
    } finally {
      await browser.close();
    }
  }
}
8 changes: 8 additions & 0 deletions server/src/modules/jobs/jobs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ import { recreateIndexes } from "./db/recreateIndexes";
import { validateModels } from "./db/schemaValidation";
import { runAcceImporter } from "./importer/acce/acce";
import { runBcnImporter } from "./importer/bcn/bcn.importer";
import { runConventionToPDF } from "./importer/legiFrance/conventionToPdf";
import { runLegiFranceImporter } from "./importer/legiFrance/scrapLegiFrance";
import { runReferentielImporter } from "./importer/referentiel/referentiel";

export async function setupJobProcessor() {
Expand Down Expand Up @@ -78,6 +80,12 @@ export async function setupJobProcessor() {
"import:referentiel": {
handler: async () => runReferentielImporter(),
},
"import:legifrance": {
handler: async () => runLegiFranceImporter(),
},
convention: {
handler: async () => runConventionToPDF(),
},
},
});
}
Loading
Loading