Skip to content

Commit

Permalink
Determine internal vs external links at parse time (#884)
Browse files Browse the repository at this point in the history
Part of #876 to split the
internal and external link checkers into distinct programs. To do this,
it's useful to split out internal vs. external links at parse time,
whereas before we did it inside `FileBatch`. The external link checker
won't use `FileBatch`.

This is only a refactor and doesn't change the program, other than now
using `HEAD` for external link requests rather than `GET`. We also now
use a `Set` for anchors so that checking if an anchor is included is
faster.

---------

Co-authored-by: Arnau Casau <[email protected]>
  • Loading branch information
Eric-Arellano and arnaucasau authored Feb 26, 2024
1 parent 74b544a commit abde486
Show file tree
Hide file tree
Showing 8 changed files with 125 additions and 86 deletions.
6 changes: 4 additions & 2 deletions scripts/commands/checkLinks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,10 @@ async function main() {

const fileBatches = await determineFileBatches(args);
const otherFiles = [
...(await globby("public/**/*")).map((fp) => new File(fp, [])),
...SYNTHETIC_FILES.map((fp) => new File(fp, [], true)),
...(await globby("public/{images,videos}/**/*")).map(
(fp) => new File(fp, new Set()),
),
...SYNTHETIC_FILES.map((fp) => new File(fp, new Set(), true)),
];

let allGood = true;
Expand Down
1 change: 1 addition & 0 deletions scripts/lib/links/ExternalLink.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ export class ExternalLink {
try {
const response = await fetch(this.value, {
headers: { "User-Agent": "qiskit-documentation-broken-links-finder" },
method: "HEAD",
});
if (response.status >= 300) {
error = `Could not find link '${this.value}'`;
Expand Down
12 changes: 10 additions & 2 deletions scripts/lib/links/FileBatch.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,23 @@ import { addLinksToMap } from "./FileBatch";
test("addLinksToMap()", () => {
const linksToMap = new Map();

addLinksToMap("file1.md", ["https://ibm.com", "./relative"], linksToMap);
addLinksToMap(
"file1.md",
new Set(["https://ibm.com", "./relative"]),
linksToMap,
);
expect(linksToMap).toEqual(
new Map([
["https://ibm.com", ["file1.md"]],
["./relative", ["file1.md"]],
]),
);

addLinksToMap("file2.md", ["./relative", "/images/my_image.png"], linksToMap);
addLinksToMap(
"file2.md",
new Set(["./relative", "/images/my_image.png"]),
linksToMap,
);
expect(linksToMap).toEqual(
new Map([
["https://ibm.com", ["file1.md"]],
Expand Down
69 changes: 36 additions & 33 deletions scripts/lib/links/FileBatch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,43 +66,36 @@ export class FileBatch {
* 2. A list of InternalLink objects to validate.
* 3. A list of ExternalLink objects to validate.
*/
async load(): Promise<[File[], InternalLink[], ExternalLink[]]> {
async load(
loadExternalLinks: boolean,
): Promise<[File[], InternalLink[], ExternalLink[]]> {
const files: File[] = [];
for (let filePath of this.toLoad) {
const parsed = await parseFile(filePath);
files.push(new File(filePath, parsed.anchors));
}

const linksToOriginFiles = new Map<string, string[]>();
const internalLinksToOriginFiles = new Map<string, string[]>();
const externalLinksToOriginFiles = new Map<string, string[]>();
for (const filePath of this.toCheck) {
const parsed = await parseFile(filePath);
files.push(new File(filePath, parsed.anchors));
if (!IGNORED_FILES.has(filePath)) {
addLinksToMap(filePath, parsed.rawLinks, linksToOriginFiles);
}
}

const internalLinks: InternalLink[] = [];
const externalLinks: ExternalLink[] = [];
for (let [linkPath, originFiles] of linksToOriginFiles) {
if (ALWAYS_IGNORED_URLS.has(linkPath)) {
continue;
}
originFiles = originFiles.filter(
(originFile) =>
FILES_TO_IGNORES[originFile] == null ||
!FILES_TO_IGNORES[originFile].includes(linkPath),
);

if (originFiles.length > 0) {
if (linkPath.startsWith("http")) {
externalLinks.push(new ExternalLink(linkPath, originFiles));
} else {
internalLinks.push(new InternalLink(linkPath, originFiles));
}
addLinksToMap(filePath, parsed.internalLinks, internalLinksToOriginFiles);
if (loadExternalLinks) {
addLinksToMap(
filePath,
parsed.externalLinks,
externalLinksToOriginFiles,
);
}
}

const internalLinks = Array.from(internalLinksToOriginFiles.entries()).map(
([link, originFiles]) => new InternalLink(link, originFiles),
);
const externalLinks = Array.from(externalLinksToOriginFiles.entries()).map(
([link, originFiles]) => new ExternalLink(link, originFiles),
);
return [files, internalLinks, externalLinks];
}

Expand All @@ -111,19 +104,21 @@ export class FileBatch {
*
* Logs the results to the console and returns `true` if there were no issues.
*/
async check(externalLinks: boolean, otherFiles: File[]): Promise<boolean> {
async check(
checkExternalLinks: boolean,
otherFiles: File[],
): Promise<boolean> {
console.log(`\n\nChecking links for ${this.description}`);

const [docsFiles, internalLinkList, externalLinkList] = await this.load();
const [docsFiles, internalLinkList, externalLinkList] =
await this.load(checkExternalLinks);
const existingFiles = docsFiles.concat(otherFiles);

const results = internalLinkList.map((link) => link.check(existingFiles));

if (externalLinks) {
// For loop reduces the risk of rate-limiting.
for (let link of externalLinkList) {
results.push(await link.check());
}
// For loop reduces the risk of rate-limiting.
for (let link of externalLinkList) {
results.push(await link.check());
}

let allGood = true;
Expand All @@ -139,10 +134,18 @@ export class FileBatch {

export function addLinksToMap(
filePath: string,
links: string[],
links: Set<string>,
linksToOriginFiles: Map<string, string[]>,
): void {
if (IGNORED_FILES.has(filePath)) return;
links.forEach((link) => {
if (
ALWAYS_IGNORED_URLS.has(link) ||
FILES_TO_IGNORES[filePath]?.includes(link)
) {
return;
}

const entry = linksToOriginFiles.get(link);
if (entry === undefined) {
linksToOriginFiles.set(link, [filePath]);
Expand Down
34 changes: 20 additions & 14 deletions scripts/lib/links/InternalLink.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ describe("Test the constructor of InternalLink", () => {
describe("Validate links", () => {
test("existing with absolute path", () => {
let testLink = new InternalLink("/testpath", ["/testorigin.mdx"]);
let testFile = new File("docs/testpath.mdx", []);
let testFile = new File("docs/testpath.mdx", new Set());
const results = testLink.check([testFile]);
expect(results).toBeUndefined();
});
Expand All @@ -47,7 +47,7 @@ describe("Validate links", () => {
let testLink = new InternalLink("/test-alternative-path", [
"/testorigin.mdx",
]);
let testFile = new File("docs/testpath.mdx", []);
let testFile = new File("docs/testpath.mdx", new Set());
const results = testLink.check([testFile]);
expect(results).toEqual(
"❌ Could not find link '/test-alternative-path'. Appears in:\n /testorigin.mdx",
Expand All @@ -58,7 +58,7 @@ describe("Validate links", () => {
let testLink = new InternalLink("../testpath", [
"docs/test/testorigin.mdx",
]);
let testFile = new File("docs/testpath.mdx", []);
let testFile = new File("docs/testpath.mdx", new Set());
const results = testLink.check([testFile]);
expect(results).toBeUndefined();
});
Expand All @@ -67,7 +67,7 @@ describe("Validate links", () => {
let testLink = new InternalLink("../testpath", [
"docs/test1/test2/testorigin.mdx",
]);
let testFile = new File("docs/testpath.mdx", []);
let testFile = new File("docs/testpath.mdx", new Set());
const results = testLink.check([testFile]);
expect(results).toEqual(
"❌ Could not find link '../testpath'. Appears in:\n docs/test1/test2/testorigin.mdx",
Expand All @@ -81,8 +81,8 @@ describe("Validate links", () => {
"docs/test/test3/testorigin.mdx",
"docs/test/test2/test4/testorigin.mdx",
]);
let testFile1 = new File("docs/testpath.mdx", []);
let testFile2 = new File("docs/test/test2/testpath.mdx", []);
let testFile1 = new File("docs/testpath.mdx", new Set());
let testFile2 = new File("docs/test/test2/testpath.mdx", new Set());
const results = testLink.check([testFile1, testFile2]);
expect(results).toBeUndefined();
});
Expand All @@ -94,8 +94,8 @@ describe("Validate links", () => {
"docs/test/test3/testorigin.mdx",
"docs/test/test2/test4/testorigin.mdx",
]);
let testFile1 = new File("docs/test/testpath.mdx", []);
let testFile2 = new File("docs/test2/test3/testpath.mdx", []);
let testFile1 = new File("docs/test/testpath.mdx", new Set());
let testFile2 = new File("docs/test2/test3/testpath.mdx", new Set());
const results = testLink.check([testFile1, testFile2]);
expect(results).toEqual(
"❌ Could not find link '/testpath'. Appears in:\n" +
Expand All @@ -115,8 +115,8 @@ describe("Validate links", () => {
"docs/test/test3/testorigin.mdx",
"docs/test/test2/test4/testorigin.mdx",
]);
let testFile1 = new File("docs/testpath.mdx", []);
let testFile2 = new File("docs/test/test2/testpath.mdx", []);
let testFile1 = new File("docs/testpath.mdx", new Set());
let testFile2 = new File("docs/test/test2/testpath.mdx", new Set());
const results = testLink.check([testFile1, testFile2]);
expect(results).toEqual(
"❌ Could not find link '../testpath'. Appears in:\n docs/test/test2/testorigin.mdx\n docs/test/test3/testorigin.mdx",
Expand All @@ -127,7 +127,7 @@ describe("Validate links", () => {
let testLink = new InternalLink("/testpath#test_anchor", [
"/testorigin.mdx",
]);
let testFile = new File("docs/testpath.mdx", ["#test_anchor"]);
let testFile = new File("docs/testpath.mdx", new Set(["#test_anchor"]));
const results = testLink.check([testFile]);
expect(results).toBeUndefined();
});
Expand All @@ -136,7 +136,10 @@ describe("Validate links", () => {
let testLink = new InternalLink("/testpath#test_anchor", [
"/testorigin.mdx",
]);
let testFile = new File("docs/testpath.mdx", ["#test_diff_anchor"]);
let testFile = new File(
"docs/testpath.mdx",
new Set(["#test_diff_anchor"]),
);
const results = testLink.check([testFile]);
expect(results).toEqual(
"❌ Could not find link '/testpath#test_anchor'. Appears in:\n /testorigin.mdx ❓ Did you mean '/testpath#test_diff_anchor'?",
Expand All @@ -147,7 +150,7 @@ describe("Validate links", () => {
let testLink = new InternalLink("../testpath#test_anchor", [
"docs/test/testorigin.mdx",
]);
let testFile = new File("docs/testpath.mdx", ["#test_anchor"]);
let testFile = new File("docs/testpath.mdx", new Set(["#test_anchor"]));
const results = testLink.check([testFile]);
expect(results).toBeUndefined();
});
Expand All @@ -156,7 +159,10 @@ describe("Validate links", () => {
let testLink = new InternalLink("../testpath#test-anchor", [
"docs/test/testorigin.mdx",
]);
let testFile = new File("docs/testpath.mdx", ["#test_diff_anchor"]);
let testFile = new File(
"docs/testpath.mdx",
new Set(["#test_diff_anchor"]),
);
const results = testLink.check([testFile]);
expect(results).toEqual(
"❌ Could not find link '../testpath#test-anchor'. Appears in:\n docs/test/testorigin.mdx ❓ Did you mean '/testpath#test_diff_anchor'?",
Expand Down
8 changes: 4 additions & 4 deletions scripts/lib/links/InternalLink.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@ const CONTENT_FILE_EXTENSIONS = [".md", ".mdx", ".ipynb"];

export class File {
readonly path: string;
readonly anchors: string[];
readonly anchors: Set<string>;
readonly synthetic: boolean;

/**
* path: Path to the file
* anchors: Anchors available in the file
*/
constructor(path: string, anchors: string[], synthetic: boolean = false) {
constructor(path: string, anchors: Set<string>, synthetic: boolean = false) {
this.path = path;
this.anchors = anchors;
this.synthetic = synthetic;
Expand Down Expand Up @@ -96,7 +96,7 @@ export class InternalLink {
existingFile.path == filePath &&
(this.anchor == "" ||
existingFile.synthetic == true ||
existingFile.anchors.includes(this.anchor)),
existingFile.anchors.has(this.anchor)),
),
);
}
Expand All @@ -122,7 +122,7 @@ export class InternalLink {
if (score < minScoreLink) {
minScoreLink = score;
suggestionPath = file.path;
suggestionPathAnchors = file.anchors;
suggestionPathAnchors = Array.from(file.anchors);
}
});

Expand Down
28 changes: 14 additions & 14 deletions scripts/lib/links/extractLinks.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,15 @@ test("parseAnchors()", () => {
## \`code-header\`
`);
expect(result).toEqual([
"#my-top-level-heading",
"#header-2",
"#code-header",
"#this-is-a-hardcoded-anchor",
"#another_span",
]);
expect(result).toEqual(
new Set([
"#my-top-level-heading",
"#header-2",
"#code-header",
"#this-is-a-hardcoded-anchor",
"#another_span",
]),
);
});

test("parseLinks()", async () => {
Expand All @@ -81,11 +83,9 @@ test("parseLinks()", async () => {
<a href="./explicit-anchor">Explicit anchor</a>
`;
const result = await parseLinks(markdown);
expect(result).toEqual([
"https://ibm.com",
"./relative",
"/images/my_image.png",
"./explicit-anchor",
]);
const [internalLinks, externalLinks] = await parseLinks(markdown);
expect(internalLinks).toEqual(
new Set(["./relative", "/images/my_image.png", "./explicit-anchor"]),
);
expect(externalLinks).toEqual(new Set(["https://ibm.com"]));
});
Loading

0 comments on commit abde486

Please sign in to comment.