Skip to content

Commit

Permalink
[Query API] Script dependencies (#1066)
Browse files Browse the repository at this point in the history
* refactor: basic SupportedQueries registry structure

* feat: allow registering an ascii summarizer for a query directly

* refactor: include query schemas in query definitions

* feat-fix: fixed meta queries being included in print

* wip: basic setup

* wip: basic data structures

* refactor: const all the arrays

* refactor: rename directory to fit the other names

* wip: some work on the actual query

* feat: libraries and sourced files dependency tracking

* feat: ascii summarizer

* feat: read and write function extraction

* feat(query-test): dep cycle breaker and criteria id resolve

* feat-fix(dep-query): support arguments :D

* refactor: cleaned up dependencies query parsing

* feat: allow including custom functions in dependencies query

* refactor: exact name matching

* refactor(dep-query): general overhaul

---------

Co-authored-by: Florian Sihler <[email protected]>
  • Loading branch information
Ellpeck and EagleoutIce authored Oct 15, 2024
1 parent d1b0e07 commit 4e8558b
Show file tree
Hide file tree
Showing 10 changed files with 449 additions and 18 deletions.
4 changes: 2 additions & 2 deletions src/cli/repl/commands/repl-query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ import { splitAtEscapeSensitive } from '../../../util/args';
import { italic } from '../../../util/ansi';
import { describeSchema } from '../../../util/schema';
import type { Query, QueryResults, SupportedQueryTypes } from '../../../queries/query';
import { executeQueries } from '../../../queries/query';

import { executeQueries } from '../../../queries/query';
import type { PipelineOutput } from '../../../core/steps/pipeline/pipeline';
import { jsonReplacer } from '../../../util/json';
import { AnyQuerySchema, QueriesSchema } from '../../../queries/query-schema';


async function getDataflow(shell: RShell, remainingLine: string) {
return await new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, {
shell,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import { RType } from '../../../../../../r-bridge/lang-4.x/ast/model/type';
import { overwriteEnvironment } from '../../../../../environments/overwrite';
import type { NoInfo } from '../../../../../../r-bridge/lang-4.x/ast/model/model';
import { expensiveTrace } from '../../../../../../util/log';
import fs from 'fs';

let sourceProvider = requestProviderFromFile();

Expand Down Expand Up @@ -76,6 +77,14 @@ export function processSourceCall<OtherInfo>(
}

export function sourceRequest<OtherInfo>(rootId: NodeId, request: RParseRequest, data: DataflowProcessorInformation<OtherInfo & ParentInformation>, information: DataflowInformation, getId: IdGenerator<NoInfo>): DataflowInformation {
if(request.request === 'file') {
/* check if the file exists and if not, fail */
if(!fs.existsSync(request.content)) {
dataflowLogger.warn(`Failed to analyze sourced file ${JSON.stringify(request)}: file does not exist`);
information.graph.markIdForUnknownSideEffects(rootId);
return information;
}
}
const executor = new RShellExecutor();

// parse, normalize and dataflow the sourced file
Expand Down
1 change: 0 additions & 1 deletion src/documentation/doc-util/doc-query.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import type { RShell } from '../../r-bridge/shell';
import type { Queries, QueryResults, SupportedQueryTypes } from '../../queries/query';
import { SupportedQueries , executeQueries } from '../../queries/query';

import { PipelineExecutor } from '../../core/pipeline-executor';
import { DEFAULT_DATAFLOW_PIPELINE } from '../../core/steps/pipeline/default-pipelines';
import { requestFromInput } from '../../r-bridge/retriever';
Expand Down
128 changes: 128 additions & 0 deletions src/queries/catalog/dependencies-query/dependencies-query-executor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import type { BasicQueryData } from '../../query';
import { executeQueries } from '../../query';
import type {
DependenciesQuery,
DependenciesQueryResult, DependencyInfo,
FunctionInfo,
LibraryInfo,
ReadInfo, SourceInfo,
WriteInfo
} from './dependencies-query-format';
import { LibraryFunctions, ReadFunctions, SourceFunctions, WriteFunctions } from './dependencies-query-format';
import type { CallContextQuery, CallContextQueryResult } from '../call-context-query/call-context-query-format';
import type { DataflowGraphVertexFunctionCall } from '../../../dataflow/graph/vertex';
import { getReferenceOfArgument } from '../../../dataflow/graph/graph';
import { log } from '../../../util/log';
import { RType } from '../../../r-bridge/lang-4.x/ast/model/type';
import { removeRQuotes } from '../../../r-bridge/retriever';
import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call';
import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id';

/** Node types whose lexeme `getArgumentValue` reports as the value of an argument; nodes of any other type are reported as `Unknown`. */
const SupportedVertexTypes = [RType.String, RType.Logical, RType.Number];

/** Placeholder that is reported whenever the value of a tracked argument cannot be read off directly. */
const Unknown = 'unknown';

/**
 * Executes the dependencies query: collects the library, source, read, and write calls
 * by issuing one call-context query per function of interest and reading the value
 * of the tracked argument of every matching call.
 *
 * Only the first entry of `queries` is evaluated; a warning is logged if the number of
 * queries is anything other than one.
 *
 * @param data    - the data to run the query on (its graph is used to resolve calls and arguments)
 * @param queries - the dependencies queries to run
 * @returns the found dependencies grouped by kind, with the elapsed time (in ms) in `.meta.timing`
 */
export function executeDependenciesQuery(data: BasicQueryData, queries: readonly DependenciesQuery[]): DependenciesQueryResult {
	if(queries.length !== 1) {
		log.warn('Dependencies query expects only up to one query, but got ', queries.length);
	}
	const start = Date.now();

	const [query] = queries;
	const skipDefaults = query.ignoreDefaultFunctions ?? false;
	// user-supplied functions are appended to the default ones (unless the defaults are skipped)
	const toCheck = {
		library: getFunctionsToCheck(query.libraryFunctions, skipDefaults, LibraryFunctions),
		source:  getFunctionsToCheck(query.sourceFunctions, skipDefaults, SourceFunctions),
		read:    getFunctionsToCheck(query.readFunctions, skipDefaults, ReadFunctions),
		write:   getFunctionsToCheck(query.writeFunctions, skipDefaults, WriteFunctions)
	};

	// a single batch of call-context queries retrieves all candidate calls, grouped by kind and function name
	const callContexts = executeQueries(data, [
		...makeCallContextQuery(toCheck.library, 'library'),
		...makeCallContextQuery(toCheck.source, 'source'),
		...makeCallContextQuery(toCheck.read, 'read'),
		...makeCallContextQuery(toCheck.write, 'write')
	])['call-context'];

	// libraries may additionally be given as plain symbols
	const libraries: LibraryInfo[] = getResults(data, callContexts, 'library', toCheck.library, (nodeId, call, value) => {
		return { nodeId, functionName: call.name, libraryName: value ?? Unknown };
	}, [RType.Symbol]);

	const sourcedFiles: SourceInfo[] = getResults(data, callContexts, 'source', toCheck.source, (nodeId, call, value) => {
		return { nodeId, functionName: call.name, file: value ?? Unknown };
	});

	const readData: ReadInfo[] = getResults(data, callContexts, 'read', toCheck.read, (nodeId, call, value) => {
		return { nodeId, functionName: call.name, source: value ?? Unknown };
	});

	const writtenData: WriteInfo[] = getResults(data, callContexts, 'write', toCheck.write, (nodeId, call, value) => {
		// write functions without a tracked argument are assumed to write to stdout
		return { nodeId, functionName: call.name, destination: value ?? 'stdout' };
	});

	const timing = Date.now() - start;
	return {
		'.meta': { timing },
		libraries,
		sourcedFiles,
		readData,
		writtenData
	};
}

/**
 * Creates one call-context query per given function.
 * Every query matches the function name exactly, includes aliases,
 * and files its results under the given `kind` with the function name as subkind.
 *
 * @param functions - the functions to search calls of
 * @param kind      - the kind under which all results are grouped
 */
function makeCallContextQuery(functions: readonly FunctionInfo[], kind: string): CallContextQuery[] {
	const queries: CallContextQuery[] = [];
	for(const { name } of functions) {
		queries.push({
			type:           'call-context',
			callName:       name,
			includeAliases: true,
			callNameExact:  true,
			subkind:        name,
			kind
		});
	}
	return queries;
}

/**
 * Converts the call-context results of one `kind` into dependency infos.
 *
 * For every matching call, the tracked argument is located: a call argument named
 * `argName` takes precedence (if present), otherwise the positional `argIdx` is used.
 * The value of that argument (or `undefined` if no argument is tracked or it cannot be
 * resolved) is handed to `makeInfo`; results for which `makeInfo` returns `undefined` are dropped.
 *
 * @param data                   - the query data whose graph holds the call vertices
 * @param results                - the call-context results to read from
 * @param kind                   - the kind of results to convert
 * @param functions              - the function descriptions the call-context queries were created from
 * @param makeInfo               - creates the dependency info for a single call
 * @param additionalAllowedTypes - node types to accept as argument values on top of the default ones
 */
function getResults<T extends DependencyInfo>(data: BasicQueryData, results: CallContextQueryResult, kind: string, functions: FunctionInfo[], makeInfo: (id: NodeId, vertex: DataflowGraphVertexFunctionCall, argument: string | undefined) => T | undefined, additionalAllowedTypes?: RType[]) {
	const collected: T[] = [];
	const subkinds = results?.kinds[kind]?.subkinds ?? {};
	for(const [functionName, calls] of Object.entries(subkinds)) {
		for(const { id } of calls) {
			const call = data.graph.getVertex(id) as DataflowGraphVertexFunctionCall;
			const description = functions.find(candidate => candidate.name === functionName) as FunctionInfo;

			// start with the positional index and let a matching named argument override it
			let position = description.argIdx;
			if(description.argName) {
				const namedPosition = call?.args.findIndex(a => a !== EmptyArgument && a.name === description.argName);
				if(namedPosition >= 0) {
					position = namedPosition;
				}
			}

			const value = position === undefined ? undefined : getArgumentValue(data, call, position, additionalAllowedTypes);
			const info = makeInfo(id, call, value);
			if(info !== undefined) {
				collected.push(info);
			}
		}
	}
	return collected;
}

/**
 * Reads the value of the argument at `argumentIndex` of the given call.
 *
 * @param graph                  - the graph (taken from the query data) whose id map is used to look up the argument node
 * @param vertex                 - the function call to inspect
 * @param argumentIndex          - the position of the argument within `vertex.args`
 * @param additionalAllowedTypes - node types to accept on top of `SupportedVertexTypes`
 * @returns the lexeme of the argument value (passed through `removeRQuotes`) if its node type is allowed,
 *          `Unknown` if the node has another type, and `undefined` if the call has no such argument
 *          or the argument cannot be resolved to a node
 */
function getArgumentValue({ graph }: BasicQueryData, vertex: DataflowGraphVertexFunctionCall, argumentIndex: number, additionalAllowedTypes: RType[] | undefined): string | undefined {
	if(!vertex || !(vertex.args.length > argumentIndex)) {
		return undefined;
	}
	// NOTE(review): this is a truthiness check, so a falsy reference (e.g., a numeric id of 0, if ids can be numeric) is treated as missing — confirm
	const reference = getReferenceOfArgument(vertex.args[argumentIndex]);
	if(!reference) {
		return undefined;
	}
	let node = graph.idMap?.get(reference);
	// look through argument wrappers to the wrapped value
	if(node?.type === RType.Argument) {
		node = node.value;
	}
	if(!node) {
		return undefined;
	}
	const accepted = [...SupportedVertexTypes, ...additionalAllowedTypes ?? []];
	if(!accepted.includes(node.type)) {
		return Unknown;
	}
	return removeRQuotes(node.lexeme as string);
}

/**
 * Assembles the list of functions to search for: the defaults (unless they are to be ignored)
 * followed by the custom functions (if any). Always returns a fresh array.
 *
 * @param customFunctions        - additional functions supplied with the query
 * @param ignoreDefaultFunctions - whether `defaultFunctions` should be left out
 * @param defaultFunctions       - the functions that are checked by default
 */
function getFunctionsToCheck(customFunctions: FunctionInfo[] | undefined, ignoreDefaultFunctions: boolean, defaultFunctions: FunctionInfo[]): FunctionInfo[] {
	const defaults: FunctionInfo[] = ignoreDefaultFunctions ? [] : defaultFunctions;
	const custom: FunctionInfo[] = customFunctions ? customFunctions : [];
	return [...defaults, ...custom];
}
129 changes: 129 additions & 0 deletions src/queries/catalog/dependencies-query/dependencies-query-format.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import type { BaseQueryFormat, BaseQueryResult } from '../../base-query-format';
import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id';
import type { QueryResults, SupportedQuery } from '../../query';
import { bold } from '../../../util/ansi';
import { printAsMs } from '../../../util/time';
import Joi from 'joi';
import { executeDependenciesQuery } from './dependencies-query-executor';

// these lists are originally based on https://github.com/duncantl/CodeDepends/blob/7fd96dfee16b252e5f642c77a7ababf48e9326f8/R/codeTypes.R
/**
 * Functions that are treated as loading a library by default.
 * `argIdx`/`argName` identify the argument that holds the name of the library.
 */
export const LibraryFunctions: FunctionInfo[] = [
{ name: 'library', argIdx: 0, argName: 'package' },
{ name: 'require', argIdx: 0, argName: 'package' },
{ name: 'loadNamespace', argIdx: 0, argName: 'package' },
{ name: 'attachNamespace', argIdx: 0, argName: 'ns' },
] as const;
/**
 * Functions that are treated as sourcing another file by default.
 * `argIdx`/`argName` identify the argument that holds the sourced file.
 */
export const SourceFunctions: FunctionInfo[] = [
{ name: 'source', argIdx: 0, argName: 'file' }
] as const;
/**
 * Functions that are treated as reading data by default.
 * `argIdx`/`argName` identify the argument whose value is reported as the source of the data.
 *
 * NOTE(review): for the connection constructors (`file`, `url`, `gzfile`, ...) the tracked
 * argument is `open` (index 1) rather than the first argument — confirm that this is intended.
 */
export const ReadFunctions: FunctionInfo[] = [
	{ name: 'read.table', argIdx: 0, argName: 'file' },
	{ name: 'read.csv', argIdx: 0, argName: 'file' },
	{ name: 'read.csv2', argIdx: 0, argName: 'file' },
	{ name: 'read.delim', argIdx: 0, argName: 'file' },
	// previously `read.delim` was listed twice, so `read.delim2` was never detected
	{ name: 'read.delim2', argIdx: 0, argName: 'file' },
	{ name: 'read.fwf', argIdx: 0, argName: 'file' },
	{ name: 'file', argIdx: 1, argName: 'open' },
	{ name: 'url', argIdx: 1, argName: 'open' },
	{ name: 'load', argIdx: 0, argName: 'file' },
	{ name: 'gzfile', argIdx: 1, argName: 'open' },
	{ name: 'bzfile', argIdx: 1, argName: 'open' },
	{ name: 'download.file', argIdx: 0, argName: 'url' },
	{ name: 'pipe', argIdx: 1, argName: 'open' },
	{ name: 'fifo', argIdx: 1, argName: 'open' },
	{ name: 'unz', argIdx: 1, argName: 'open' },
	{ name: 'matrix', argIdx: 0, argName: 'data' },
	{ name: 'readRDS', argIdx: 0, argName: 'file' },
	{ name: 'readLines', argIdx: 0, argName: 'con' },
] as const;
/**
 * Functions that are treated as writing data by default.
 * `argIdx`/`argName` identify the argument whose value is reported as the destination.
 */
export const WriteFunctions: FunctionInfo[] = [
{ name: 'save', argIdx: 0, argName: '...' },
{ name: 'save.image', argIdx: 0, argName: 'file' },
{ name: 'write', argIdx: 1, argName: 'file' },
{ name: 'dput', argIdx: 1, argName: 'file' },
{ name: 'dump', argIdx: 1, argName: 'file' },
{ name: 'write.table', argIdx: 1, argName: 'file' },
{ name: 'write.csv', argIdx: 1, argName: 'file' },
{ name: 'saveRDS', argIdx: 1, argName: 'file' },
// write functions that have neither `argIdx` nor `argName` are assumed to write to stdout
{ name: 'print' },
{ name: 'cat' },
] as const;

/**
 * Describes a function whose calls should be reported as a dependency,
 * together with the argument that carries the value of interest.
 */
export interface FunctionInfo {
/** The name of the function; calls are matched against it exactly. */
name: string
/** The zero-based position of the argument of interest, used if no argument named `argName` is found in the call. */
argIdx?: number
/** The name of the argument of interest; if the call has an argument with this name, it takes precedence over `argIdx`. */
argName?: string
}

/**
 * The dependencies query, which reports libraries, sourced files, read data, and written data.
 * The custom function lists are checked in addition to the default ones.
 */
export interface DependenciesQuery extends BaseQueryFormat {
readonly type: 'dependencies'
/** If true, the default function lists are skipped and only the custom functions are checked (treated as `false` if absent). */
readonly ignoreDefaultFunctions?: boolean
/** Additional functions to treat as loading a library. */
readonly libraryFunctions?: FunctionInfo[]
/** Additional functions to treat as sourcing a file. */
readonly sourceFunctions?: FunctionInfo[]
/** Additional functions to treat as reading data. */
readonly readFunctions?: FunctionInfo[]
/** Additional functions to treat as writing data. */
readonly writeFunctions?: FunctionInfo[]
}

/** The result of a dependencies query, with one list of found calls per kind of dependency. */
export interface DependenciesQueryResult extends BaseQueryResult {
/** Calls that load a library. */
libraries: LibraryInfo[]
/** Calls that source another file. */
sourcedFiles: SourceInfo[]
/** Calls that read data. */
readData: ReadInfo[]
/** Calls that write data. */
writtenData: WriteInfo[]
}

/** The information shared by all kinds of dependencies. */
export interface DependencyInfo {
/** The id of the function call that constitutes the dependency. */
nodeId: NodeId
/** The name of the called function. */
functionName: string
}
/** A loaded library; `libraryName` is 'unknown' if the name could not be determined. */
export type LibraryInfo = (DependencyInfo & { libraryName: 'unknown' | string })
/** A sourced file. */
export type SourceInfo = (DependencyInfo & { file: string })
/** A read of data from `source`. */
export type ReadInfo = (DependencyInfo & { source: string })
/** A write of data; `destination` is 'stdout' for functions without a tracked argument. */
export type WriteInfo = (DependencyInfo & { destination: 'stdout' | string })

/**
 * Appends a textual section for the given dependency infos to `result`.
 * Nothing is appended if there are no infos. Otherwise the title is followed by one
 * entry per function name (in order of first occurrence), each followed by a single
 * (newline-joined) string that lists all calls of that function.
 *
 * @param title            - the heading of the section
 * @param infos            - the dependency infos to print
 * @param result           - the output lines to append to
 * @param sectionSpecifics - renders the kind-specific part of a single info
 */
function printResultSection<T extends DependencyInfo>(title: string, infos: T[], result: string[], sectionSpecifics: (info: T) => string): void {
	if(infos.length === 0) {
		return;
	}
	result.push(` ╰ ${title}`);

	// group by function name, a Map retains the order of first occurrence
	const byFunction = new Map<string, T[]>();
	for(const info of infos) {
		const bucket = byFunction.get(info.functionName);
		if(bucket) {
			bucket.push(info);
		} else {
			byFunction.set(info.functionName, [info]);
		}
	}

	byFunction.forEach((entries, functionName) => {
		result.push(` ╰ ${functionName}`);
		const lines = entries.map(entry => ` ╰ Node Id: ${entry.nodeId}, ${sectionSpecifics(entry)}`);
		result.push(lines.join('\n'));
	});
}

/**
 * Schema for an optional list of function descriptions.
 * It is shared by the library, source, read, and write function lists of the dependencies query,
 * so the descriptions are deliberately not specific to libraries.
 */
const functionInfoSchema: Joi.ArraySchema = Joi.array().items(Joi.object({
	name:    Joi.string().required().description('The name of the function.'),
	argIdx:  Joi.number().optional().description('The index of the argument that contains the value of interest (e.g., the library name or the file).'),
	argName: Joi.string().optional().description('The name of the argument that contains the value of interest (e.g., the library name or the file).'),
})).optional();

/**
 * The registry entry of the dependencies query: its executor, the summarizer that
 * renders a result as text (one section per kind of dependency), and the schema
 * that describes the query format.
 */
export const DependenciesQueryDefinition = {
	executor:        executeDependenciesQuery,
	asciiSummarizer: (formatter, _processed, queryResults, result) => {
		const {
			'.meta': meta,
			libraries,
			sourcedFiles,
			readData,
			writtenData
		} = queryResults as QueryResults<'dependencies'>['dependencies'];
		const heading = `Query: ${bold('dependencies', formatter)} (${printAsMs(meta.timing, 0)})`;
		result.push(heading);
		printResultSection('Libraries', libraries, result, lib => `Library Name: ${lib.libraryName}`);
		printResultSection('Sourced Files', sourcedFiles, result, src => `Sourced File: ${src.file}`);
		printResultSection('Read Data', readData, result, read => `Source: ${read.source}`);
		printResultSection('Written Data', writtenData, result, write => `Destination: ${write.destination}`);
		return true;
	},
	schema: Joi.object({
		type:                   Joi.string().valid('dependencies').required().description('The type of the query.'),
		ignoreDefaultFunctions: Joi.boolean().optional().description('Should the set of functions that are detected by default be ignored/skipped?'),
		libraryFunctions:       functionInfoSchema.description('The set of library functions to search for.'),
		sourceFunctions:        functionInfoSchema.description('The set of source functions to search for.'),
		readFunctions:          functionInfoSchema.description('The set of data reading functions to search for.'),
		writeFunctions:         functionInfoSchema.description('The set of data writing functions to search for.'),
	}).description('The dependencies query retrieves and returns the set of all dependencies in the dataflow graph, which includes libraries, sourced files, read data, and written data.')
} as const satisfies SupportedQuery<'dependencies'>;
4 changes: 2 additions & 2 deletions src/queries/catalog/lineage-query/lineage-query-format.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ export const LineageQueryDefinition = {
return true;
},
schema: Joi.object({
type: Joi.string().valid('lineage').required().description('The type of the query.'),
id: Joi.string().required().description('The ID of the node to get the lineage of.')
type: Joi.string().valid('lineage').required().description('The type of the query.'),
criterion: Joi.string().required().description('The slicing criterion of the node to get the lineage of.')
}).description('Lineage query used to find the lineage of a node in the dataflow graph')
} as const satisfies SupportedQuery<'lineage'>;
22 changes: 13 additions & 9 deletions src/queries/query.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import type { CallContextQuery } from './catalog/call-context-query/call-context-query-format';
import type {
CallContextQuery
} from './catalog/call-context-query/call-context-query-format';
import { CallContextQueryDefinition } from './catalog/call-context-query/call-context-query-format';

import type { DataflowGraph } from '../dataflow/graph/graph';
import type { BaseQueryFormat, BaseQueryResult } from './base-query-format';
import { guard } from '../util/assert';
Expand All @@ -15,18 +16,20 @@ import type { IdMapQuery } from './catalog/id-map-query/id-map-query-format';
import { IdMapQueryDefinition } from './catalog/id-map-query/id-map-query-format';
import type { NormalizedAstQuery } from './catalog/normalized-ast-query/normalized-ast-query-format';
import { NormalizedAstQueryDefinition } from './catalog/normalized-ast-query/normalized-ast-query-format';
import type { DataflowClusterQuery } from './catalog/cluster-query/cluster-query-format';
import { ClusterQueryDefinition } from './catalog/cluster-query/cluster-query-format';
import type { StaticSliceQuery } from './catalog/static-slice-query/static-slice-query-format';
import { StaticSliceQueryDefinition } from './catalog/static-slice-query/static-slice-query-format';
import type { LineageQuery } from './catalog/lineage-query/lineage-query-format';
import { LineageQueryDefinition } from './catalog/lineage-query/lineage-query-format';
import { type OutputFormatter } from '../util/ansi';
import type { StaticSliceQuery } from './catalog/static-slice-query/static-slice-query-format';
import { StaticSliceQueryDefinition } from './catalog/static-slice-query/static-slice-query-format';
import type { DataflowClusterQuery } from './catalog/cluster-query/cluster-query-format';
import { ClusterQueryDefinition } from './catalog/cluster-query/cluster-query-format';
import type { DependenciesQuery } from './catalog/dependencies-query/dependencies-query-format';
import { DependenciesQueryDefinition } from './catalog/dependencies-query/dependencies-query-format';
import type { OutputFormatter } from '../util/ansi';
import type { PipelineOutput } from '../core/steps/pipeline/pipeline';
import type { DEFAULT_DATAFLOW_PIPELINE } from '../core/steps/pipeline/default-pipelines';
import type Joi from 'joi';

export type Query = CallContextQuery | DataflowQuery | NormalizedAstQuery | IdMapQuery | DataflowClusterQuery | StaticSliceQuery | LineageQuery;
export type Query = CallContextQuery | DataflowQuery | NormalizedAstQuery | IdMapQuery | DataflowClusterQuery | StaticSliceQuery | LineageQuery | DependenciesQuery;

export type QueryArgumentsWithType<QueryType extends BaseQueryFormat['type']> = Query & { type: QueryType };

Expand Down Expand Up @@ -55,7 +58,8 @@ export const SupportedQueries = {
'normalized-ast': NormalizedAstQueryDefinition,
'dataflow-cluster': ClusterQueryDefinition,
'static-slice': StaticSliceQueryDefinition,
'lineage': LineageQueryDefinition
'lineage': LineageQueryDefinition,
'dependencies': DependenciesQueryDefinition
} as const satisfies SupportedQueries;

export type SupportedQueryTypes = keyof typeof SupportedQueries;
Expand Down
Loading

2 comments on commit 4e8558b

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"artificial" Benchmark Suite

Benchmark suite Current: 4e8558b Previous: 96e933e Ratio
Retrieve AST from R code 236.1963695 ms (98.53802331355531) 246.50067822727272 ms (108.0778534921102) 0.96
Normalize R AST 17.57016290909091 ms (32.23490712176691) 17.822801272727272 ms (32.47501104567212) 0.99
Produce dataflow information 39.29645863636363 ms (84.71955755824334) 40.41399131818182 ms (86.99830114822697) 0.97
Total per-file 807.6120788636364 ms (1459.2505344998815) 829.6500495454545 ms (1498.0426737610057) 0.97
Static slicing 2.050900625103245 ms (1.1313021626987982) 2.0971564864344034 ms (1.2085738336418979) 0.98
Reconstruct code 0.22672889927999057 ms (0.17177578486699926) 0.24421041798721546 ms (0.1931884846808244) 0.93
Total per-slice 2.2914889796247775 ms (1.1956336432867123) 2.356082671207842 ms (1.2841644653561477) 0.97
failed to reconstruct/re-parse 0 # 0 # 1
times hit threshold 0 # 0 # 1
reduction (characters) 0.7869360165281424 # 0.7869360165281424 # 1
reduction (normalized tokens) 0.7639690077689504 # 0.7639690077689504 # 1
memory (df-graph) 95.46617542613636 KiB (244.77619956879823) 95.46617542613636 KiB (244.77619956879823) 1

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"social-science" Benchmark Suite

Benchmark suite Current: 4e8558b Previous: 96e933e Ratio
Retrieve AST from R code 246.36505763999997 ms (47.1146225533251) 239.76002830000002 ms (44.476154875177215) 1.03
Normalize R AST 18.79301938 ms (14.196313372923575) 19.04045506 ms (14.770405721401682) 0.99
Produce dataflow information 58.939789020000006 ms (58.401454565614195) 74.39786466 ms (87.80796950166253) 0.79
Total per-file 7825.671137640001 ms (29322.07959020926) 7666.33215246 ms (28737.408915639426) 1.02
Static slicing 16.219829461001765 ms (44.76652076418638) 15.907723437298863 ms (43.83669809749617) 1.02
Reconstruct code 0.28892649048976327 ms (0.1594027494733059) 0.2509487217116593 ms (0.14943631432024615) 1.15
Total per-slice 16.51712281535588 ms (44.79866028788043) 16.166379499642126 ms (43.873427530614464) 1.02
failed to reconstruct/re-parse 0 # 0 # 1
times hit threshold 0 # 0 # 1
reduction (characters) 0.8712997340230448 # 0.8712997340230448 # 1
reduction (normalized tokens) 0.8102441553774778 # 0.8102441553774778 # 1
memory (df-graph) 99.8990234375 KiB (113.72812769327498) 99.8990234375 KiB (113.72812769327498) 1

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.