Skip to content
This repository has been archived by the owner on Jul 25, 2023. It is now read-only.

Gitlab prov metadata #31

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
65 changes: 65 additions & 0 deletions lib/appendPipelineProv.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import rdf from 'rdf-ext'
import { Transform } from 'readable-stream'
import { checkEnvironment, provFromGitlab } from './metadata/produceProv.js'
import * as ns from './namespaces.js'

/**
 * Pass-through transform that appends pipeline provenance (PROV) quads once
 * the input has ended.
 *
 * Every incoming quad is forwarded untouched. When `checkEnvironment()` reports
 * a Gitlab environment, each `?s rdf:type <subjectsWithClass>` quad seen on the
 * way additionally links the provenance pointer to `?s` via `prov:generated`.
 * The provenance dataset is emitted on flush, and only if at least one such
 * subject was seen.
 */
class ProvMetadata extends Transform {
  /**
   * @param {object} [context] - Object exposing an optional `logger.info`; used only to report the environment check.
   * @param {object} options
   * @param {object} options.subjectsWithClass - Term of the class whose instances are reported as generated.
   * @param {object} [options.graph] - Graph term for the provenance quads; when omitted they are emitted as produced.
   * @param {string} [options.baseNamespace] - Forwarded to `provFromGitlab`.
   */
  constructor (context, { subjectsWithClass, graph, baseNamespace }) {
    super({ objectMode: true })
    this.type = subjectsWithClass
    this.graph = graph
    // Explicit initial value: the flag is a boolean from the start instead of
    // springing into existence on the first matching quad.
    this.needsProvenance = false

    const { environment, message } = checkEnvironment()
    if (message) {
      context?.logger?.info(message)
    }
    if (environment === 'Gitlab') {
      this.provPointer = provFromGitlab({ baseNamespace })
    }
  }

  /**
   * Forwards the quad and, for matching type statements, records the subject
   * as generated by the activity the provenance pointer stands for.
   */
  _transform (quad, encoding, callback) {
    if (this.provPointer && quad.predicate.equals(ns.rdf.type) && quad.object.equals(this.type)) {
      this.provPointer.addOut(ns.prov.generated, quad.subject)
      this.needsProvenance = true
    }

    callback(null, quad)
  }

  /**
   * Emits the collected provenance quads, moved into `this.graph` when one was
   * configured.
   *
   * Deliberately NOT `async`: nothing is awaited here, and an async body would
   * turn a throw into an unhandled rejection while the stream never gets its
   * callback. Failures are handed to the stream through `callback(err)`.
   */
  _flush (callback) {
    try {
      if (this.provPointer && this.needsProvenance) {
        for (const quad of this.provPointer.dataset) {
          this.push(this.graph
            ? rdf.quad(quad.subject, quad.predicate, quad.object, this.graph)
            : quad)
        }
      }
    } catch (err) {
      callback(err)
      return
    }
    callback()
  }
}

/**
 * Normalises a user-supplied value to an RDF term.
 *
 * - an object with a truthy `term` property yields that `term`
 * - a string is wrapped with `rdf.namedNode`
 * - anything else is returned untouched
 */
function toNamedNode (item) {
  const wrapped = item?.term
  if (wrapped) {
    return wrapped
  }
  if (typeof item !== 'string') {
    return item
  }
  return rdf.namedNode(item)
}

/**
 * Factory for the provenance-appending stream.
 *
 * The function's own `this` is handed to `ProvMetadata` as its context.
 *
 * @param {object} [options]
 * @param {string|object} options.subjectsWithClass - Class (IRI string or term) whose instances are reported; mandatory.
 * @param {string|object} [options.graph] - Graph (IRI string or term) for the appended quads.
 * @param {string} [options.baseNamespace] - Passed through unchanged.
 * @returns {ProvMetadata}
 * @throws {Error} when `subjectsWithClass` is missing
 */
function appendPipelineProv ({ subjectsWithClass, graph, baseNamespace } = {}) {
  if (subjectsWithClass) {
    const options = {
      subjectsWithClass: toNamedNode(subjectsWithClass),
      graph: toNamedNode(graph),
      baseNamespace
    }
    return new ProvMetadata(this, options)
  }

  throw new Error('Needs subjectsWithClass as parameter (string or namedNode)')
}

export default appendPipelineProv
150 changes: 150 additions & 0 deletions lib/metadata/produceProv.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import namespace from '@rdfjs/namespace'
import clownface from 'clownface'
import rdf from 'rdf-ext'
import * as ns from '../namespaces.js'

// Cuts the string at its last '/', dropping the slash and everything after it.
// A string without any '/' yields ''.
const withoutLastSegment = url => {
  const cut = url.lastIndexOf('/')
  return cut === -1 ? '' : url.slice(0, cut)
}

// Text following the last '/'; the whole string when it contains no '/'.
const lastSegment = url => url.slice(url.lastIndexOf('/') + 1)

// Namespace builder for the pipeline/job/codebase terms (provz.Job,
// provz.hasApp, ...) used when describing the CI run.
const provz = namespace('https://barnard-prov.described.at/')

// The rdf:type predicate, built directly from its IRI.
const type = rdf.namedNode('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')

// Environment variables that must be non-empty for PROV metadata generation.
const requiredVars = [
  'GITLAB_CI',
  'CI_JOB_URL',
  'CI_PROJECT_URL',
  'CI_PIPELINE_URL'
]

/**
 * Detects the CI environment from `process.env`.
 *
 * @returns {{environment: (string|undefined), message: (string|undefined)}}
 *   `environment` is 'Gitlab' when GITLAB_CI is set (even if other required
 *   variables are missing, in which case `message` lists them); both fields
 *   are undefined otherwise.
 */
function checkEnvironment () {
  if (!process.env.GITLAB_CI) {
    return { environment: undefined, message: undefined }
  }

  const missing = []
  for (const name of requiredVars) {
    if (!process.env[name]) {
      missing.push(name)
    }
  }

  let message = 'Gitlab detected, producing pipeline prov-metadata'
  if (missing.length > 0) {
    message = `Gitlab detected, but some of the required environment variables required to generate PROV metadata were not found [${missing}]`
  }

  return { environment: 'Gitlab', message }
}

/**
 * Builds the provenance description of the current Gitlab CI job from the
 * CI_* environment variables.
 *
 * Mandatory inputs are CI_JOB_URL, CI_PROJECT_URL and CI_PIPELINE_URL. When
 * any of them is missing nothing can be described, so the function returns
 * `undefined` instead of failing on `undefined.split(...)` or minting IRIs
 * from the text "undefined".
 *
 * @param {object} [options]
 * @param {string} [options.baseNamespace] - When given, the CI_PROJECT_URL
 *   prefix of every minted IRI is replaced by this value (the code assumes the
 *   job/pipeline URLs start with CI_PROJECT_URL). The original URLs stay
 *   reachable through `provz:hasApp`.
 * @returns {object|undefined} clownface pointer positioned on the job node,
 *   or `undefined` when a mandatory variable is missing.
 */
function provFromGitlab ({ baseNamespace } = {}) {
  const {
    CI_JOB_URL: jobUrl,
    CI_PROJECT_URL: projectUrl,
    CI_PIPELINE_URL: pipelineRun
  } = process.env

  // Guard first: every statement below dereferences these three values.
  if (!jobUrl || !projectUrl || !pipelineRun) {
    return undefined
  }

  const mint = url => baseNamespace
    ? `${baseNamespace}${url.slice(projectUrl.length)}`
    : url

  // The pointer to the Job, this is the one that generates artifacts
  const jobUri = rdf.namedNode(mint(jobUrl))
  const pointer = clownface({ dataset: rdf.dataset(), term: jobUri })
    .addOut(type, provz.Job)
    .addOut(provz.hasApp, rdf.namedNode(jobUrl))
    .addOut(type, ns.prov.Activity)

  // Codebase
  const codebaseUri = rdf.namedNode(mint(projectUrl))
  pointer.node(codebaseUri)
    .addOut(type, provz.Codebase)
    .addOut(ns.rdfs.label, lastSegment(projectUrl))
    .addOut(provz.hasApp, rdf.namedNode(projectUrl))

  // A Pipeline Run triggers Jobs. (download, transform etc)
  const pipelineRunUri = rdf.namedNode(mint(pipelineRun))
  pointer.node(pipelineRunUri)
    .addOut(ns.rdfs.label, lastSegment(pipelineRun))
    .addOut(type, provz.PipelineRun)
    .addOut(provz.hasApp, rdf.namedNode(pipelineRun))
    .addOut(provz.hasJob, jobUri)

  // all the pipelines for this specific codebase
  const pipelineCollectionUrl = withoutLastSegment(pipelineRun)
  const pipelineRunCollectionUri = rdf.namedNode(mint(pipelineCollectionUrl))
  pointer.node(pipelineRunCollectionUri)
    .addOut(type, provz.PipelineRunCollection)
    .addOut(ns.rdfs.label, `${lastSegment(projectUrl)} pipelines runs`)
    .addOut(provz.hasPipelineRun, pipelineRunUri)
    .addOut(provz.hasApp, rdf.namedNode(pipelineCollectionUrl))

  pointer.node(codebaseUri)
    .addOut(provz.hasPipelineCollection, pipelineRunCollectionUri)

  // Job Optionals
  const jobStartTime = process.env.CI_JOB_STARTED_AT
  if (jobStartTime) {
    // NOTE(review): the pipeline run below uses prov:startedAtTime while the
    // job uses provz:startedAtTime — confirm the difference is intended.
    pointer.node(jobUri)
      .addOut(provz.startedAtTime, rdf.literal(jobStartTime, ns.xsd.dateTime))
  }

  // NOTE(review): CI_BUILD_REF_SLUG looks like the legacy name of
  // CI_COMMIT_REF_SLUG — confirm it is still set on the targeted Gitlab version.
  const environment = process.env.CI_BUILD_REF_SLUG
  if (environment) {
    const environmentUri = provz[`environment/${environment}`] // Should this be so absolute?
    pointer.node(environmentUri)
      .addOut(type, provz.Environment)
      .addOut(provz.hasJob, jobUri)
  }

  // Pipeline optionals
  const pipelineRunStartTime = process.env.CI_PIPELINE_CREATED_AT
  if (pipelineRunStartTime) {
    pointer.node(pipelineRunUri)
      .addOut(ns.prov.startedAtTime,
        rdf.literal(pipelineRunStartTime, ns.xsd.dateTime))
  }

  // Codebase optionals
  const codebaseDescription = process.env.CI_PROJECT_DESCRIPTION
  if (codebaseDescription) {
    pointer.node(codebaseUri).addOut(ns.schema.description, codebaseDescription)
  }

  const codebaseName = process.env.CI_PROJECT_NAME
  if (codebaseName) {
    pointer.node(codebaseUri).addOut(ns.schema.name, codebaseName)
  }

  // Commit Optionals
  const commitSha = process.env.CI_COMMIT_SHA

  if (commitSha) {
    const commitUrl = `${projectUrl}/-/commit/${commitSha}`
    const commitUri = rdf.namedNode(mint(commitUrl))

    pointer.node(jobUri).addOut(ns.prov.wasStartedBy, commitUri)
    pointer.node(codebaseUri).addOut(provz.hasCommit, commitUri)

    pointer.node(commitUri)
      .addOut(ns.schema.name, `Commit ${commitSha}`)
      .addOut(provz.hasApp, rdf.namedNode(commitUrl))
      .addOut(type, provz.Commit)
      .addOut(provz.triggered, pipelineRunUri)

    // The commit title, when available, is added as a second schema:name.
    const commitName = process.env.CI_COMMIT_TITLE
    if (commitName) {
      pointer.node(commitUri).addOut(ns.schema.name, commitName)
    }

    const commitAuthor = process.env.CI_COMMIT_AUTHOR
    if (commitAuthor) {
      pointer.node(commitUri).addOut(provz.author, commitAuthor)
    }

    const commitTime = process.env.CI_COMMIT_TIMESTAMP
    if (commitTime) {
      pointer.node(commitUri)
        .addOut(ns.prov.atTime, rdf.literal(commitTime, ns.xsd.dateTime))
    }
  }

  // `pointer` is still positioned on the job node: `.node()` hands out new
  // pointers over the same dataset.
  return pointer
}

export { checkEnvironment, provFromGitlab }
9 changes: 5 additions & 4 deletions lib/namespaces.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import namespace from '@rdfjs/namespace'

// Namespace builders, one per vocabulary, each created from the vocabulary's
// base IRI.
const cube = namespace('https://cube.link/')
const rdf = namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
const rdfs = namespace('http://www.w3.org/2000/01/rdf-schema#')
const sh = namespace('http://www.w3.org/ns/shacl#')
const xsd = namespace('http://www.w3.org/2001/XMLSchema#')
// `void` is a reserved word in JavaScript, hence the leading underscore.
const _void = namespace('http://rdfs.org/ns/void#')
const dcat = namespace('http://www.w3.org/ns/dcat#')
const schema = namespace('http://schema.org/')
const dcterms = namespace('http://purl.org/dc/terms/')

export { cube, rdf, rdfs, sh, xsd, _void, dcat, schema, dcterms }
export { cube, rdfs, sh, xsd, _void, dcterms }
export { schema } from '@tpluscode/rdf-ns-builders'
export { dcat } from '@tpluscode/rdf-ns-builders'
export { rdf } from '@tpluscode/rdf-ns-builders'
export { prov } from '@tpluscode/rdf-ns-builders'
3 changes: 2 additions & 1 deletion metadata.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import append from './lib/append.js'
import appendPipelineProv from './lib/appendPipelineProv.js'
import voidStats from './lib/voidStats.js'

export { append, voidStats }
export { append, voidStats, appendPipelineProv }
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"@rdfjs/namespace": "^1.1.0",
"@rdfjs/term-map": "^1.0.0",
"@rdfjs/term-set": "^1.0.1",
"@tpluscode/rdf-ns-builders": "^3.0.2",
"clownface": "^1.3.0",
"file-fetch": "^1.7.0",
"lodash": "^4.17.21",
Expand Down
122 changes: 122 additions & 0 deletions test/appendPipelineProv.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import { strictEqual } from 'assert'
import namespace from '@rdfjs/namespace'
import assertThrows from 'assert-throws-async'
import getStream from 'get-stream'
import { isDuplex } from 'isstream'
import { describe, it, before, after } from 'mocha'
import rdf from 'rdf-ext'
import { Readable } from 'readable-stream'
import appendPipelineProv from '../lib/appendPipelineProv.js'
import * as ns from '../lib/namespaces.js'
import {
clearGitlabMockEnvironment, setGitlabMockEnvironment
} from './support/gitlabEnvironment.js'

// Example namespace for the subjects and graphs used as fixtures
const ex = namespace('http://example.org/')

describe('metadata.appendPipelineProv', () => {
  // The environment check keys on GITLAB_CI. Hide it for this suite so the
  // "no environment variables" case behaves the same on a developer machine
  // and on a real Gitlab runner, then put the original value back.
  let originalGitlabCi

  before(() => {
    originalGitlabCi = process.env.GITLAB_CI
    delete process.env.GITLAB_CI
  })

  after(() => {
    if (originalGitlabCi !== undefined) {
      process.env.GITLAB_CI = originalGitlabCi
    }
  })

  it('should be a factory', () => {
    strictEqual(typeof appendPipelineProv, 'function')
  })

  it('should throw an error if no argument is given', async () => {
    await assertThrows(async () => {
      await appendPipelineProv()
    }, Error, /Needs subjectsWithClass as parameter/)
  })

  it(
    'should return a duplex stream with a subjectsWithClass (namedNode) metadata parameter',
    async () => {
      const step = await appendPipelineProv({
        subjectsWithClass: ns.dcat.Dataset
      })
      strictEqual(isDuplex(step), true)
    })

  it('should append no prov metadata with no environment variables',
    async () => {
      const initial = [
        rdf.quad(ex.subject0, ns.rdf.type, ns.dcat.Dataset, ex.graph0)]

      const step = await appendPipelineProv({
        subjectsWithClass: ns.dcat.Dataset
      })

      const result = await getStream.array(Readable.from(initial).pipe(step))

      // Only the input quad comes out again
      strictEqual(result.length, 1)
      strictEqual(result[0].equals(initial[0]), true)
    })
})

describe('metadata.appendPipelineProv, case with Gitlab environment variables', () => {
  before(setGitlabMockEnvironment)

  after(clearGitlabMockEnvironment)

  // Sends a single `ex:subject0 rdf:type <typeClass>` quad (in ex:graph0)
  // through a new step built from `options` and resolves with every quad the
  // step emitted.
  const run = async (options, typeClass = ns.dcat.Dataset) => {
    const initial = [
      rdf.quad(ex.subject0, ns.rdf.type, typeClass, ex.graph0)]
    const step = await appendPipelineProv(options)
    return getStream.array(Readable.from(initial).pipe(step))
  }

  it(
    'should append prov metadata with a subjectsWithClass (namedNode) metadata parameter',
    async () => {
      const result = await run({ subjectsWithClass: ns.dcat.Dataset })

      strictEqual(result.length > 1, true)
    })

  it(
    'should append prov metadata with a subjectsWithClass (string) metadata parameter',
    async () => {
      const result = await run({
        subjectsWithClass: `${ns.dcat.Dataset.value}`
      })

      strictEqual(result.length > 1, true)
    })

  it(
    'should append no prov metadata when subjectsWithClass does not match',
    async () => {
      const result = await run({
        subjectsWithClass: `${ns.dcat.Dataset.value}`
      }, ns.dcat.Unknown)

      // Compare the number itself so a failure reports the actual count
      strictEqual(result.length, 1)
    })

  it('should append prov metadata with the specified graph', async () => {
    const result = await run({
      subjectsWithClass: ns.dcat.Dataset, graph: ex.graph1
    })

    strictEqual(result.length > 1, true)

    // The forwarded input quad keeps its graph; everything appended after it
    // must sit in the requested graph.
    for (const [index, quad] of result.entries()) {
      if (index === 0) {
        strictEqual(quad.graph.equals(ex.graph0), true)
      } else {
        strictEqual(quad.graph.equals(ex.graph1), true)
      }
    }
  })
})
2 changes: 2 additions & 0 deletions test/metadata.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
import './metadata/produceProv.test.js'
import './metadata/applyOptions.test.js'
Loading