-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Query API] Script dependencies (#1066)
* refactor: basic SupportedQueries registry structure * feat: allow registering an ascii summarizer for a query directly * refactor: include query schemas in query definitions * feat-fix: fixed meta queries being included in print * wip: basic setup * wip: basic data structures * refactor: const all the arrays * refactor: rename directory to fit the other names * wip: some work on the actual query * feat: libraries and sourced files dependency tracking * feat: ascii summarizer * feat: read and write function extraction * feat(query-test): dep cycle breaker and criteria id resolve * feat-fix(dep-query): support arguments :D * refactor: cleaned up dependencies query parsing * feat: allow including custom functions in dependencies query * refactor: exact name matching * refactor(dep-query): general overhaul --------- Co-authored-by: Florian Sihler <[email protected]>
- Loading branch information
1 parent
d1b0e07
commit 4e8558b
Showing
10 changed files
with
449 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
128 changes: 128 additions & 0 deletions
128
src/queries/catalog/dependencies-query/dependencies-query-executor.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
import type { BasicQueryData } from '../../query'; | ||
import { executeQueries } from '../../query'; | ||
import type { | ||
DependenciesQuery, | ||
DependenciesQueryResult, DependencyInfo, | ||
FunctionInfo, | ||
LibraryInfo, | ||
ReadInfo, SourceInfo, | ||
WriteInfo | ||
} from './dependencies-query-format'; | ||
import { LibraryFunctions, ReadFunctions, SourceFunctions, WriteFunctions } from './dependencies-query-format'; | ||
import type { CallContextQuery, CallContextQueryResult } from '../call-context-query/call-context-query-format'; | ||
import type { DataflowGraphVertexFunctionCall } from '../../../dataflow/graph/vertex'; | ||
import { getReferenceOfArgument } from '../../../dataflow/graph/graph'; | ||
import { log } from '../../../util/log'; | ||
import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; | ||
import { removeRQuotes } from '../../../r-bridge/retriever'; | ||
import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; | ||
import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id'; | ||
|
||
/** AST node types whose lexeme `getArgumentValue` reports (with R quotes removed) as the value of an argument; callers may allow further types per lookup. */
const SupportedVertexTypes = [RType.String, RType.Logical, RType.Number]; | ||
|
||
/** Placeholder reported when an argument's node type is not among the allowed ones, or when a library/source/read call yields no argument value at all. */
const Unknown = 'unknown'; | ||
|
||
/**
 * Executes the dependencies query: finds every call to a known library-loading,
 * file-sourcing, data-reading, or data-writing function (via call-context queries)
 * and reports, per call, the argument value that could be read off the AST.
 *
 * Only the first entry of `queries` is evaluated, and a warning is logged whenever
 * the number of queries is not exactly one. If `queries` is empty, the default
 * function lists are used, as if a query without any options had been passed.
 *
 * @param data    - The query data providing the dataflow graph to search in.
 * @param queries - The dependencies queries; only the first one is used.
 * @returns The detected libraries, sourced files, read data, and written data,
 *          plus the elapsed time in milliseconds under `.meta.timing`.
 */
export function executeDependenciesQuery(data: BasicQueryData, queries: readonly DependenciesQuery[]): DependenciesQueryResult {
    if(queries.length !== 1) {
        log.warn('Dependencies query expects only up to one query, but got ', queries.length);
    }
    const now = Date.now();

    // `queries` may be empty, in which case there are no options to read
    const query: DependenciesQuery | undefined = queries[0];
    const ignoreDefault = query?.ignoreDefaultFunctions ?? false;
    const libraryFunctions = getFunctionsToCheck(query?.libraryFunctions, ignoreDefault, LibraryFunctions);
    const sourceFunctions = getFunctionsToCheck(query?.sourceFunctions, ignoreDefault, SourceFunctions);
    const readFunctions = getFunctionsToCheck(query?.readFunctions, ignoreDefault, ReadFunctions);
    const writeFunctions = getFunctionsToCheck(query?.writeFunctions, ignoreDefault, WriteFunctions);

    // one call-context query per function; the kind tells the four categories apart
    const results = executeQueries(data, [
        ...makeCallContextQuery(libraryFunctions, 'library'),
        ...makeCallContextQuery(sourceFunctions, 'source'),
        ...makeCallContextQuery(readFunctions, 'read'),
        ...makeCallContextQuery(writeFunctions, 'write')
    ])['call-context'];

    // library names may also be given as bare symbols, hence the additional allowed type
    const libraries: LibraryInfo[] = getResults(data, results, 'library', libraryFunctions, (id, vertex, argument) => ({
        nodeId:       id,
        functionName: vertex.name,
        libraryName:  argument ?? Unknown
    }), [RType.Symbol]);
    const sourcedFiles: SourceInfo[] = getResults(data, results, 'source', sourceFunctions, (id, vertex, argument) => ({
        nodeId:       id,
        functionName: vertex.name,
        file:         argument ?? Unknown
    }));
    const readData: ReadInfo[] = getResults(data, results, 'read', readFunctions, (id, vertex, argument) => ({
        nodeId:       id,
        functionName: vertex.name,
        source:       argument ?? Unknown
    }));
    const writtenData: WriteInfo[] = getResults(data, results, 'write', writeFunctions, (id, vertex, argument) => ({
        nodeId:       id,
        functionName: vertex.name,
        // write functions that don't have argIndex are assumed to write to stdout
        destination:  argument ?? 'stdout'
    }));

    return {
        '.meta': {
            timing: Date.now() - now
        },
        libraries, sourcedFiles, readData, writtenData
    };
}
|
||
/**
 * Creates one call-context query per given function. Each query matches the
 * function's name exactly, has `includeAliases` enabled, and files its hits
 * under the given `kind` with the function name as subkind.
 *
 * @param functions - The functions to create queries for.
 * @param kind      - The kind under which all hits are grouped.
 * @returns The queries, in the order of `functions`.
 */
function makeCallContextQuery(functions: readonly FunctionInfo[], kind: string): CallContextQuery[] {
    const queries: CallContextQuery[] = [];
    for(const { name } of functions) {
        queries.push({
            type:           'call-context',
            callName:       name,
            includeAliases: true,
            callNameExact:  true,
            subkind:        name,
            kind
        });
    }
    return queries;
}
|
||
/**
 * Turns the call-context hits of one `kind` into dependency infos.
 *
 * For every hit, the argument of interest is located (a call argument named
 * `argName` takes precedence over the positional `argIdx`), its value is resolved
 * with `getArgumentValue`, and the outcome is handed to `makeInfo`.
 *
 * Hits are skipped if their subkind does not correspond to any of `functions`,
 * if the graph holds no vertex for their id, or if `makeInfo` returns `undefined`.
 *
 * @param data                   - The query data providing the dataflow graph.
 * @param results                - The call-context result to read the hits from (may be absent).
 * @param kind                   - The kind whose subkinds are to be processed.
 * @param functions              - The functions the subkinds were created from.
 * @param makeInfo               - Builds the info for one call; the argument is `undefined` if none could be found.
 * @param additionalAllowedTypes - Additional node types accepted as argument values.
 * @returns The infos in the order of the subkinds and their hits.
 */
function getResults<T extends DependencyInfo>(data: BasicQueryData, results: CallContextQueryResult, kind: string, functions: FunctionInfo[], makeInfo: (id: NodeId, vertex: DataflowGraphVertexFunctionCall, argument: string | undefined) => T | undefined, additionalAllowedTypes?: RType[]): T[] {
    const infos: T[] = [];
    for(const [name, calls] of Object.entries(results?.kinds[kind]?.subkinds ?? {})) {
        // the function description is the same for every call of this subkind
        const info = functions.find(f => f.name === name);
        if(info === undefined) {
            continue;
        }
        for(const { id } of calls) {
            const vertex = data.graph.getVertex(id) as DataflowGraphVertexFunctionCall | undefined;
            if(vertex === undefined) {
                // without a vertex there is neither a function name nor an argument to report
                continue;
            }
            let index = info.argIdx;
            if(info.argName) {
                const named = vertex.args.findIndex(arg => arg !== EmptyArgument && arg.name === info.argName);
                if(named >= 0) {
                    index = named;
                }
            }
            const argument = index !== undefined ? getArgumentValue(data, vertex, index, additionalAllowedTypes) : undefined;
            const made = makeInfo(id, vertex, argument);
            if(made !== undefined) {
                infos.push(made);
            }
        }
    }
    return infos;
}
|
||
/**
 * Resolves the value of the call argument at `argumentIndex`.
 *
 * @param graph                  - Taken from the query data; its `idMap` is used to look up the argument's AST node.
 * @param vertex                 - The function call vertex whose argument is inspected.
 * @param argumentIndex          - The position of the argument within `vertex.args`.
 * @param additionalAllowedTypes - Node types accepted in addition to `SupportedVertexTypes`.
 * @returns The node's lexeme with R quotes removed if its type is allowed, `Unknown` if the node
 *          exists but has another type, and `undefined` if there is no such argument or node.
 */
function getArgumentValue({ graph }: BasicQueryData, vertex: DataflowGraphVertexFunctionCall, argumentIndex: number, additionalAllowedTypes: RType[] | undefined): string | undefined {
    if(!vertex || !(vertex.args.length > argumentIndex)) {
        return undefined;
    }
    const reference = getReferenceOfArgument(vertex.args[argumentIndex]);
    if(!reference) {
        return undefined;
    }
    const referenced = graph.idMap?.get(reference);
    // an argument node merely wraps the value we are interested in
    const valueNode = referenced?.type === RType.Argument ? referenced.value : referenced;
    if(!valueNode) {
        return undefined;
    }
    const allowedTypes = [...SupportedVertexTypes, ...additionalAllowedTypes ?? []];
    if(allowedTypes.includes(valueNode.type)) {
        return removeRQuotes(valueNode.lexeme as string);
    }
    return Unknown;
}
|
||
/**
 * Assembles the list of functions to search for: the defaults (unless they are
 * to be ignored) followed by the custom functions (if any).
 *
 * @param customFunctions        - Functions supplied with the query, if any.
 * @param ignoreDefaultFunctions - Whether to leave out `defaultFunctions`.
 * @param defaultFunctions       - The functions searched for by default.
 * @returns A new array; neither input array is modified.
 */
function getFunctionsToCheck(customFunctions: FunctionInfo[] | undefined, ignoreDefaultFunctions: boolean, defaultFunctions: FunctionInfo[]): FunctionInfo[] {
    const defaults = ignoreDefaultFunctions ? [] : defaultFunctions;
    const custom = customFunctions ? customFunctions : [];
    return [...defaults, ...custom];
}
129 changes: 129 additions & 0 deletions
129
src/queries/catalog/dependencies-query/dependencies-query-format.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
import type { BaseQueryFormat, BaseQueryResult } from '../../base-query-format'; | ||
import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id'; | ||
import type { QueryResults, SupportedQuery } from '../../query'; | ||
import { bold } from '../../../util/ansi'; | ||
import { printAsMs } from '../../../util/time'; | ||
import Joi from 'joi'; | ||
import { executeDependenciesQuery } from './dependencies-query-executor'; | ||
|
||
// these lists are originally based on https://github.com/duncantl/CodeDepends/blob/7fd96dfee16b252e5f642c77a7ababf48e9326f8/R/codeTypes.R

/** Functions that load a library by default; the argument of interest is the package/namespace. */
export const LibraryFunctions: FunctionInfo[] = [
    { name: 'library',         argIdx: 0, argName: 'package' },
    { name: 'require',         argIdx: 0, argName: 'package' },
    { name: 'loadNamespace',   argIdx: 0, argName: 'package' },
    { name: 'attachNamespace', argIdx: 0, argName: 'ns' },
];

/** Functions that source another file by default. */
export const SourceFunctions: FunctionInfo[] = [
    { name: 'source', argIdx: 0, argName: 'file' }
];

/**
 * Functions that read data by default. Every name must appear only once,
 * as each entry results in its own call-context query.
 */
export const ReadFunctions: FunctionInfo[] = [
    { name: 'read.table',    argIdx: 0, argName: 'file' },
    { name: 'read.csv',      argIdx: 0, argName: 'file' },
    { name: 'read.csv2',     argIdx: 0, argName: 'file' },
    { name: 'read.delim',    argIdx: 0, argName: 'file' },
    { name: 'read.delim2',   argIdx: 0, argName: 'file' },
    { name: 'read.fwf',      argIdx: 0, argName: 'file' },
    // NOTE(review): the connection constructors below track `open` (index 1)
    // rather than their first argument - confirm that this is intended
    { name: 'file',          argIdx: 1, argName: 'open' },
    { name: 'url',           argIdx: 1, argName: 'open' },
    { name: 'load',          argIdx: 0, argName: 'file' },
    { name: 'gzfile',        argIdx: 1, argName: 'open' },
    { name: 'bzfile',        argIdx: 1, argName: 'open' },
    { name: 'download.file', argIdx: 0, argName: 'url' },
    { name: 'pipe',          argIdx: 1, argName: 'open' },
    { name: 'fifo',          argIdx: 1, argName: 'open' },
    { name: 'unz',           argIdx: 1, argName: 'open' },
    { name: 'matrix',        argIdx: 0, argName: 'data' },
    { name: 'readRDS',       argIdx: 0, argName: 'file' },
    { name: 'readLines',     argIdx: 0, argName: 'con' },
];

/** Functions that write data by default. */
export const WriteFunctions: FunctionInfo[] = [
    { name: 'save',        argIdx: 0, argName: '...' },
    { name: 'save.image',  argIdx: 0, argName: 'file' },
    { name: 'write',       argIdx: 1, argName: 'file' },
    { name: 'dput',        argIdx: 1, argName: 'file' },
    { name: 'dump',        argIdx: 1, argName: 'file' },
    { name: 'write.table', argIdx: 1, argName: 'file' },
    { name: 'write.csv',   argIdx: 1, argName: 'file' },
    { name: 'saveRDS',     argIdx: 1, argName: 'file' },
    // write functions that don't have argIndex are assumed to write to stdout
    { name: 'print' },
    { name: 'cat' },
];
|
||
/**
 * Describes a function whose calls the dependencies query reports,
 * and where to find the argument of interest within such a call.
 */
export interface FunctionInfo { | ||
/** The name of the function; the executor matches call names against it exactly. */
name: string | ||
/** The position of the argument of interest; without it (and without a matching `argName`) no argument is inspected. */
argIdx?: number | ||
/** The name of the argument of interest; if a call passes an argument with this name, its position takes precedence over `argIdx`. */
argName?: string | ||
} | ||
|
||
/**
 * The dependencies query. Besides the built-in function lists, custom functions
 * can be supplied for each of the four dependency categories.
 */
export interface DependenciesQuery extends BaseQueryFormat { | ||
readonly type: 'dependencies' | ||
/** If `true`, the built-in function lists are left out and only the custom functions are searched for (treated as `false` if omitted). */
readonly ignoreDefaultFunctions?: boolean | ||
/** Additional functions to treat as library-loading functions. */
readonly libraryFunctions?: FunctionInfo[] | ||
/** Additional functions to treat as file-sourcing functions. */
readonly sourceFunctions?: FunctionInfo[] | ||
/** Additional functions to treat as data-reading functions. */
readonly readFunctions?: FunctionInfo[] | ||
/** Additional functions to treat as data-writing functions. */
readonly writeFunctions?: FunctionInfo[] | ||
} | ||
|
||
/** The result of the dependencies query, with one list per dependency category. */
export interface DependenciesQueryResult extends BaseQueryResult { | ||
/** One entry per detected call of a library-loading function. */
libraries: LibraryInfo[] | ||
/** One entry per detected call of a file-sourcing function. */
sourcedFiles: SourceInfo[] | ||
/** One entry per detected call of a data-reading function. */
readData: ReadInfo[] | ||
/** One entry per detected call of a data-writing function. */
writtenData: WriteInfo[] | ||
} | ||
|
||
/** The information that is reported for every dependency, regardless of its category. */
export interface DependencyInfo {
    /** The id of the function call that constitutes the dependency. */
    nodeId: NodeId
    /** The name of the called function. */
    functionName: string
}
/** A loaded library; `libraryName` is `'unknown'` if the name could not be determined. */
export type LibraryInfo = DependencyInfo & { libraryName: string }
/** A sourced file; `file` is `'unknown'` if the file could not be determined. */
export type SourceInfo = DependencyInfo & { file: string }
/** A data read; `source` is `'unknown'` if the source could not be determined. */
export type ReadInfo = DependencyInfo & { source: string }
/** A data write; `destination` is `'stdout'` if no destination argument was found for the call. */
export type WriteInfo = DependencyInfo & { destination: string }
|
||
/**
 * Appends the textual summary of one result section to `result`: a title line,
 * followed - for every function name, in order of first appearance - by a line
 * with the function name and a single (possibly multi-line) entry listing its calls.
 * Nothing is appended if `infos` is empty.
 *
 * @param title            - The heading of the section.
 * @param infos            - The dependency infos to print.
 * @param result           - The output lines to append to.
 * @param sectionSpecifics - Renders the section-specific part of a single info.
 */
function printResultSection<T extends DependencyInfo>(title: string, infos: T[], result: string[], sectionSpecifics: (info: T) => string): void {
    if(infos.length <= 0) {
        return;
    }
    result.push(` ╰ ${title}`);

    // a Map keeps the function names in the order in which they were first seen
    const byFunction = new Map<string, T[]>();
    for(const info of infos) {
        const bucket = byFunction.get(info.functionName);
        if(bucket === undefined) {
            byFunction.set(info.functionName, [info]);
        } else {
            bucket.push(info);
        }
    }

    byFunction.forEach((calls, functionName) => {
        result.push(` ╰ ${functionName}`);
        const lines = calls.map(call => ` ╰ Node Id: ${call.nodeId}, ${sectionSpecifics(call)}`);
        result.push(lines.join('\n'));
    });
}
|
||
/**
 * Validation schema for an optional list of function descriptions (see `FunctionInfo`).
 * It is shared by the library, source, read, and write function lists of the query,
 * so the field descriptions are deliberately not specific to libraries.
 */
const functionInfoSchema: Joi.ArraySchema = Joi.array().items(Joi.object({
    name:    Joi.string().required().description('The name of the function.'),
    argIdx:  Joi.number().optional().description('The index of the argument that contains the value of interest.'),
    argName: Joi.string().optional().description('The name of the argument that contains the value of interest.'),
})).optional();
|
||
/**
 * Registry entry of the dependencies query: its executor, the summarizer that renders
 * a result as text (timing header followed by one section per dependency category),
 * and the schema used to validate incoming queries.
 */
export const DependenciesQueryDefinition = {
    executor:        executeDependenciesQuery,
    asciiSummarizer: (formatter, _processed, queryResults, result) => {
        const { '.meta': meta, libraries, sourcedFiles, readData, writtenData } = queryResults as QueryResults<'dependencies'>['dependencies'];
        const header = `Query: ${bold('dependencies', formatter)} (${printAsMs(meta.timing, 0)})`;
        result.push(header);
        printResultSection('Libraries', libraries, result, library => `Library Name: ${library.libraryName}`);
        printResultSection('Sourced Files', sourcedFiles, result, sourced => `Sourced File: ${sourced.file}`);
        printResultSection('Read Data', readData, result, read => `Source: ${read.source}`);
        printResultSection('Written Data', writtenData, result, written => `Destination: ${written.destination}`);
        return true;
    },
    schema: Joi.object({
        type:                   Joi.string().valid('dependencies').required().description('The type of the query.'),
        ignoreDefaultFunctions: Joi.boolean().optional().description('Should the set of functions that are detected by default be ignored/skipped?'),
        libraryFunctions:       functionInfoSchema.description('The set of library functions to search for.'),
        sourceFunctions:        functionInfoSchema.description('The set of source functions to search for.'),
        readFunctions:          functionInfoSchema.description('The set of data reading functions to search for.'),
        writeFunctions:         functionInfoSchema.description('The set of data writing functions to search for.'),
    }).description('The dependencies query retrieves and returns the set of all dependencies in the dataflow graph, which includes libraries, sourced files, read data, and written data.')
} as const satisfies SupportedQuery<'dependencies'>;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
4e8558b
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"artificial" Benchmark Suite
| Benchmark | Current | Previous | Ratio |
|---|---|---|---|
| Retrieve AST from R code | 236.1963695 ms (98.53802331355531) | 246.50067822727272 ms (108.0778534921102) | 0.96 |
| Normalize R AST | 17.57016290909091 ms (32.23490712176691) | 17.822801272727272 ms (32.47501104567212) | 0.99 |
| Produce dataflow information | 39.29645863636363 ms (84.71955755824334) | 40.41399131818182 ms (86.99830114822697) | 0.97 |
| Total per-file | 807.6120788636364 ms (1459.2505344998815) | 829.6500495454545 ms (1498.0426737610057) | 0.97 |
| Static slicing | 2.050900625103245 ms (1.1313021626987982) | 2.0971564864344034 ms (1.2085738336418979) | 0.98 |
| Reconstruct code | 0.22672889927999057 ms (0.17177578486699926) | 0.24421041798721546 ms (0.1931884846808244) | 0.93 |
| Total per-slice | 2.2914889796247775 ms (1.1956336432867123) | 2.356082671207842 ms (1.2841644653561477) | 0.97 |
| failed to reconstruct/re-parse | 0 # | 0 # | 1 |
| times hit threshold | 0 # | 0 # | 1 |
| reduction (characters) | 0.7869360165281424 # | 0.7869360165281424 # | 1 |
| reduction (normalized tokens) | 0.7639690077689504 # | 0.7639690077689504 # | 1 |
| memory (df-graph) | 95.46617542613636 KiB (244.77619956879823) | 95.46617542613636 KiB (244.77619956879823) | 1 |
This comment was automatically generated by workflow using github-action-benchmark.
4e8558b
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"social-science" Benchmark Suite
| Benchmark | Current | Previous | Ratio |
|---|---|---|---|
| Retrieve AST from R code | 246.36505763999997 ms (47.1146225533251) | 239.76002830000002 ms (44.476154875177215) | 1.03 |
| Normalize R AST | 18.79301938 ms (14.196313372923575) | 19.04045506 ms (14.770405721401682) | 0.99 |
| Produce dataflow information | 58.939789020000006 ms (58.401454565614195) | 74.39786466 ms (87.80796950166253) | 0.79 |
| Total per-file | 7825.671137640001 ms (29322.07959020926) | 7666.33215246 ms (28737.408915639426) | 1.02 |
| Static slicing | 16.219829461001765 ms (44.76652076418638) | 15.907723437298863 ms (43.83669809749617) | 1.02 |
| Reconstruct code | 0.28892649048976327 ms (0.1594027494733059) | 0.2509487217116593 ms (0.14943631432024615) | 1.15 |
| Total per-slice | 16.51712281535588 ms (44.79866028788043) | 16.166379499642126 ms (43.873427530614464) | 1.02 |
| failed to reconstruct/re-parse | 0 # | 0 # | 1 |
| times hit threshold | 0 # | 0 # | 1 |
| reduction (characters) | 0.8712997340230448 # | 0.8712997340230448 # | 1 |
| reduction (normalized tokens) | 0.8102441553774778 # | 0.8102441553774778 # | 1 |
| memory (df-graph) | 99.8990234375 KiB (113.72812769327498) | 99.8990234375 KiB (113.72812769327498) | 1 |
This comment was automatically generated by workflow using github-action-benchmark.