Skip to content

Commit

Permalink
Add RepoData class
Browse files Browse the repository at this point in the history
  • Loading branch information
domoscargin committed Dec 27, 2024
1 parent 7c0a6bb commit bc8c164
Show file tree
Hide file tree
Showing 6 changed files with 203 additions and 213 deletions.
5 changes: 5 additions & 0 deletions NOTES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
## Current status

The new approach works up to a point. Two bugs have surfaced so far: some async functions were not being awaited, and one function was missing its return statement. Then the run hit the API rate limit!

Next step is to run the script again and keep bugfixing.
256 changes: 52 additions & 204 deletions build-filtered-data.mjs
Original file line number Diff line number Diff line change
@@ -1,26 +1,12 @@
import { writeFileSync } from 'fs'
import { Readable } from 'stream'
import { json2csv } from 'json-2-csv'
import { RequestError } from 'octokit'
import JSONStream from 'JSONStream'
import es from 'event-stream'

import * as yarnLock from '@yarnpkg/lockfile'
import checkDenyList from './helpers/check-deny-list.mjs'
import checkServiceOwner from './helpers/check-service-owner.mjs'
import {
NoPackageJsonError,
CouldntReadPackageError,
IndirectDependencyError,
handleError,
} from './helpers/error-handling.mjs'
import {
getRepoMetaData,
getLatestCommit,
getFileContent,
getRepoTree,
getRemainingRateLimit,
} from './helpers/octokit.mjs'
import denyList from './helpers/data/deny-list.json' assert { type: 'json' }
import governmentServiceOwners from './helpers/data/service-owners.json' assert { type: 'json' }
import { handleError } from './helpers/error-handling.mjs'
import { getRemainingRateLimit } from './helpers/octokit.mjs'
import { RepoData } from './helpers/repo-data.mjs'

import rawDeps from './data/raw-deps.json' assert { type: 'json' }

Expand All @@ -35,7 +21,7 @@ async function filterDeps() {
let batchCounter = 0
console.log(`${performance.now()}: Analysis BEGIN`)

for (const repo of rawDeps.all_public_dependent_repos) {
for (const repo of rawDeps.all_public_dependent_repos.slice(0, 10)) {
try {
console.log(`${performance.now()}: Getting repo data...`)
const repoData = await analyseRepo(repo)
Expand Down Expand Up @@ -74,216 +60,78 @@ async function filterDeps() {
console.log(`${performance.now()}: We're done!`)
}

/**
 * Prints a message to the console, prefixed with the current
 * `performance.now()` reading followed by a colon and a space.
 *
 * @param {string} message - Text to print after the timestamp prefix.
 */
function log(message) {
  const stamp = performance.now()
  const line = `${stamp}: ${message}`
  console.log(line)
}

async function analyseRepo(repo) {
// Output data columns
const repoOwner = repo.owner
const repoName = repo.repo_name
let builtByGovernment = false
let indirectDependency = false
let isPrototype = false
let frontendVersion = null
let lockfileType = null
let versionDoubt = false
let couldntAccess = false
let lastUpdated = null
let repoCreated = null
let parentDependency = null
const repoData = new RepoData(repoOwner, repoName, governmentServiceOwners)

try {
if (checkDenyList(repoName, repoOwner)) {
console.log(
`${performance.now()}: ${
repo.name
} is on the 'deny' list and will not be processed`
)
if (repoData.checkDenyList(denyList)) {
log(`${repo.name} is on the 'deny' list and will not be processed`)
return null
}
log(`Analyzing ${repo.name}...`)

builtByGovernment = checkServiceOwner(repoOwner)
if (builtByGovernment) {
console.log(
`${performance.now()}: ${repo.name} looks like a GOV.UK service.`
)
await repoData.fetchAndValidateMetaData()
log(`${repo.name} metadata fetched and validated`)
await repoData.fetchAndValidateRepoTree()
log(`${repo.name} tree fetched and validated`)

if (repoData.builtByGovernment) {
log(`${repo.name} looks like a GOV.UK service.`)
} else {
console.log(
`${performance.now()}: ${
repo.name
} looks like it ISN'T a GOV.UK service. This has been noted.`
log(
`${repo.name} looks like it ISN'T a GOV.UK service. This has been noted.`
)
}

// Get repo data
const repoMetaData = await getRepoMetaData(repoOwner, repoName)
if (repoMetaData) {
lastUpdated = repoMetaData.data.pushed_at
repoCreated = repoMetaData.data.created_at
}

const latestCommit = await getLatestCommit(repoOwner, repoName)
const repoTree = await getRepoTree(repoOwner, repoName, latestCommit.sha)
if (!repoTree.data.tree.find((file) => file.path == 'package.json')) {
indirectDependency = true
throw new NoPackageJsonError()
}

// Handle Package.json
// TODO: account for multiple package files
if (repoTree.data.tree.find((file) => file.path == 'package-lock.json')) {
lockfileType = 'package-lock.json'
} else if (repoTree.data.tree.find((file) => file.path == 'yarn.lock')) {
lockfileType = 'yarn.lock'
}

const packageFile = await getFileContent(
repoOwner,
repoName,
'package.json'
)
const packageFile = await repoData.getRepoFileContent('package.json')
const packageObject = JSON.parse(packageFile.data)
if (!('dependencies' in packageObject)) {
indirectDependency = true
throw new CouldntReadPackageError()
}

// Prototype checking
isPrototype =
repoTree.data.tree.find((file) => file.path == 'lib/usage_data.js') !=
undefined || 'govuk-prototype-kit' in packageObject.dependencies
if (isPrototype) {
console.log(
`${performance.now()}: ${
repo.name
} looks like an instance of the prototype kit. This has been noted.`
if (repoData.checkPrototype(packageObject)) {
log(
`${repo.name} looks like an instance of the prototype kit. This has been noted.`
)
repoData.isPrototype = true
}

// Handle indirect dependencies
if (!('govuk-frontend' in packageObject.dependencies)) {
indirectDependency = true
throw new IndirectDependencyError()
// TODO: Create a findIndirectDependencies function, add an array of the parents to the output column
if (
!('dependencies' in packageObject) ||
!('govuk-frontend' in packageObject.dependencies)
) {
repoData.indirectDependency = true
}

frontendVersion = packageObject.dependencies['govuk-frontend']
console.log(
`${performance.now()}: ${
repo.name
} is using GOV.UK Frontend version ${frontendVersion}`
)
// TODO: Since we only search the Packagelock file if we find a frontend version
// we don't need to do anything but search for the `node_modules/govuk-frontend` entry
// in the getExactFrontendVersion function.
// If however, we don't find govuk-frontend in the dependencies, then we have an indirect dependency
// and we should search the lockfile for the govuk-frontend sub-dependencies
if (frontendVersion.includes('^') || frontendVersion.includes('~')) {
frontendVersion = await getExactFrontendVersion(
repoOwner,
repoName,
frontendVersion,
lockfileType,
parentDependency
if (!repoData.indirectDependency) {
repoData.frontendVersion = packageObject.dependencies['govuk-frontend']
log(
`${repo.name} is using GOV.UK Frontend version ${repoData.frontendVersion}`
)
versionDoubt =
frontendVersion.includes('^') || frontendVersion.includes('~')
}

if (
repoData.frontendVersion.startsWith('^') ||
repoData.frontendVersion.startsWith('~') ||
repoData.indirectDependency
) {
repoData.versionDoubt = true
repoData.frontendVersion = await repoData.getVersionFromLockfile()
}
} catch (error) {
handleError(error, repoName)
repoData.errorThrown = error.toString()
if (error instanceof RequestError) {
couldntAccess = true
versionDoubt = true
repoData.couldntAccess = true
repoData.versionDoubt = true
}
}

return {
repoOwner,
repoName,
couldntAccess,
frontendVersion,
versionDoubt,
builtByGovernment,
indirectDependency,
isPrototype,
lastUpdated,
repoCreated,
parentDependency,
}
}

async function getExactFrontendVersion(
repoOwner,
repoName,
frontendVersion,
lockfileType,
parentDependency
) {
try {
if (lockfileType === 'package-lock.json') {
const packageLockFile = await getFileContent(
repoOwner,
repoName,
'package-lock.json'
)
const versionAndParent = await getFrontendVersionFromPackageLock(
packageLockFile.data
)
// eslint-disable-next-line no-unused-vars
parentDependency = versionAndParent.parent
return versionAndParent.version || frontendVersion
} else if (lockfileType === 'yarn.lock') {
const yarnLockFile = await getFileContent(
repoOwner,
repoName,
'yarn.lock'
)
const yarnLockObject = yarnLock.default.parse(yarnLockFile.data)
return (
yarnLockObject.object[`govuk-frontend@${frontendVersion}`]?.version ||
frontendVersion
)
}
} catch (error) {
console.log('There was a problem with processing the lockfile:', error)
}
return frontendVersion.replace('^', '').replace('~', '')
}

// TODO: Streaming is probably overkill.
async function getFrontendVersionFromPackageLock(packageLockText) {
const stream = Readable.from([packageLockText])

// Parse top-level keys to track parents
const parser = JSONStream.parse('*')

return new Promise((resolve, reject) => {
let result = { version: null, parent: null }

stream.pipe(parser).pipe(
es
.mapSync((data) => {
Object.entries(data).forEach(([parentKey, value]) => {
if (parentKey === 'node_modules/govuk-frontend') {
console.log(
`${performance.now()}: Found the node_modules/govuk-frontend package entry, version ${
data[parentKey].version
}`
)
result = { version: data[parentKey].version, parent: null }
} else if (value.dependencies?.['govuk-frontend']) {
if (parentKey) {
console.log(
`${performance.now()}: Found govuk-frontend as a dependency of: ${parentKey}. This has been noted.`
)
}
result = {
version: value.dependencies['govuk-frontend'].version,
parent: parentKey,
}
}
})
})
.on('end', () => resolve(result))
.on('error', reject)
)
})
return repoData.getResult()
}

async function writeBatchToFiles(builtData) {
Expand All @@ -295,7 +143,7 @@ async function writeBatchToFiles(builtData) {
// Write CSV file
const csv = json2csv(builtData)
await writeFileSync(`data/${yyyymmdd}-${timestamp}-filtered-data.csv`, csv)
console.log(`${performance.now()}: Data file updated with batch of entries`)
log(`Data file updated with batch of entries`)
}

filterDeps()
3 changes: 0 additions & 3 deletions helpers/check-deny-list.mjs

This file was deleted.

3 changes: 0 additions & 3 deletions helpers/check-service-owner.mjs

This file was deleted.

9 changes: 6 additions & 3 deletions helpers/error-handling.mjs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import { RequestError } from 'octokit'

export class NoPackageJsonError extends Error {}
export class CouldntReadPackageError extends Error {}
export class NoDirectDependenciesError extends Error {}
export class IndirectDependencyError extends Error {}
export class NoDataError extends Error {}

/**
* Logs errors
Expand All @@ -19,11 +20,13 @@ export function handleError(error, repoName) {
error.message
}`
)
} else if (error instanceof NoDataError) {
console.log(`${performance.now()}: Couldn't fetch data for ${repoName}.`)
} else if (error instanceof NoPackageJsonError) {
console.log(
`${performance.now()}: ${repoName} doesn't have a package.json at its project root. Assuming indirect usage of GOV.UK Frontend.`
`${performance.now()}: ${repoName} doesn't have a package.json at its project root. This has been noted.`
)
} else if (error instanceof CouldntReadPackageError) {
} else if (error instanceof NoDirectDependenciesError) {
console.log(
`${performance.now()}: Couldn't find a direct dependencies list for ${repoName}. Assuming indirect usage of GOV.UK Frontend.`
)
Expand Down
Loading

0 comments on commit bc8c164

Please sign in to comment.