From 9bdd5e1c05179d3fcd1bbd44658dfd228887e535 Mon Sep 17 00:00:00 2001 From: Le Roux Bodenstein Date: Tue, 11 Apr 2023 14:08:27 +0100 Subject: [PATCH] chore: extract the analyser from the stream (#185) * extract the analyzer from the stream * increase timeout to reduce flakyness --- package.json | 2 +- src/index.ts | 4 +- src/schema-analyzer.ts | 427 ++++++++++++++++++ src/stats.ts | 2 +- src/stream.ts | 422 +---------------- test/array-object-types.test.ts | 2 +- test/basic-embedded-array.test.ts | 2 +- test/basic-embedded-documents.test.ts | 2 +- test/basic-probability.test.ts | 2 +- test/basic-unique.test.ts | 2 +- test/basic.test.ts | 2 +- test/mixed-type-evolving-schema.test.ts | 2 +- test/mixed-type-nested.test.ts | 2 +- test/mixed-type-order.test.ts | 2 +- test/mixed-type-probability.test.ts | 2 +- test/nested-document-path.test.ts | 2 +- test/promise.test.ts | 2 +- ...sion-strings-have-same-probability.test.ts | 2 +- test/semantic-types.test.ts | 2 +- test/stats.test.ts | 2 +- test/stream.test.ts | 2 +- 21 files changed, 469 insertions(+), 420 deletions(-) create mode 100644 src/schema-analyzer.ts diff --git a/package.json b/package.json index f54fd50..cc209c8 100644 --- a/package.json +++ b/package.json @@ -33,7 +33,7 @@ ".esm-wrapper.mjs" ], "scripts": { - "test": "nyc mocha --colors -r ts-node/register test/*.ts", + "test": "nyc mocha --timeout 5000 --colors -r ts-node/register test/*.ts", "test-example-parse-from-file": "ts-node examples/parse-from-file.ts", "test-example-parse-schema": "ts-node examples/parse-schema.ts", "build": "npm run compile-ts && gen-esm-wrapper . ./.esm-wrapper.mjs", diff --git a/src/index.ts b/src/index.ts index 254ad68..06b9952 100644 --- a/src/index.ts +++ b/src/index.ts @@ -3,7 +3,8 @@ import { pipeline as callbackPipeline, Readable, PassThrough } from 'stream'; import { promisify } from 'util'; import stream from './stream'; -import type { SchemaParseOptions, Schema, SchemaField } from './stream'; +import { SchemaAnalyzer } from './schema-analyzer'; +import type { SchemaParseOptions, Schema, SchemaField } from './schema-analyzer'; import * as schemaStats from './stats'; type MongoDBCursor = AggregationCursor | FindCursor; @@ -53,5 +54,6 @@ export type { Schema, SchemaField }; export { stream, + SchemaAnalyzer, schemaStats }; diff --git a/src/schema-analyzer.ts b/src/schema-analyzer.ts new file mode 100644 index 0000000..8ad2e85 --- /dev/null +++ b/src/schema-analyzer.ts @@ -0,0 +1,427 @@ +import _ from 'lodash'; + +import Reservoir from 'reservoir'; + +import semanticTypeRegisters from './semantic-types'; + +import type { + Document, + ObjectId, + MinKey, + MaxKey, + Long, + Double, + Int32, + Decimal128, + Binary, + BSONRegExp, + Code, + BSONSymbol, + Timestamp +} from 'bson'; + +type BaseSchemaType = { + path: string; + count: number; + probability: number; + has_duplicates: boolean; + unique: number; +} + +type ConstantSchemaType = BaseSchemaType & { + name: 'Null' | 'Undefined'; +} + +type TypeCastMap = { + Array: unknown[]; + Binary: Binary; + Boolean: boolean; + Code: Code; + Date: Date; + Decimal128: Decimal128; + Double: Double; + Int32: Int32; + Int64: Long; + MaxKey: MaxKey; + MinKey: MinKey; + Null: null; + Object: Record; + ObjectId: ObjectId; + BSONRegExp: BSONRegExp; + String: string; + BSONSymbol: BSONSymbol; + Timestamp: Timestamp; + Undefined: undefined; +}; + +type TypeCastTypes = keyof TypeCastMap; +type BSONValue = TypeCastMap[TypeCastTypes]; + +export type PrimitiveSchemaType = BaseSchemaType & { + name: 'String' | 'Number' | 'Int32' | 'Boolean' | 'Decimal128' | 'Long' | 'ObjectId' | 'Date' | 'RegExp' | 'Symbol' | 'MaxKey' | 'MinKey' | 'Binary' | 'Code' | 'Timestamp' | 'DBRef'; + values: BSONValue[]; +} + +export type ArraySchemaType = BaseSchemaType & { + name: 'Array'; + lengths: number[]; + average_length: number; + total_count: number; + // eslint-disable-next-line no-use-before-define + types: SchemaType[]; +} + +export type DocumentSchemaType = BaseSchemaType & { + name: 'Document'; + // eslint-disable-next-line no-use-before-define + fields: SchemaField[]; +} + +export type SchemaType = ConstantSchemaType | PrimitiveSchemaType | ArraySchemaType | DocumentSchemaType; + +export type SchemaField = { + name: string; + count: number; + path: string; + type: string | string[]; + probability: number; + has_duplicates: boolean; + types: SchemaType[]; +}; + +export type Schema = { + count: number; + fields: SchemaField[] +} + +// Used when building the end Schema. +type SchemaBuildingMap = { + [typeName: string]: { + name: string; + path: string; + count: number; + lengths?: number[]; + average_length?: number; + total_count?: number; + types?: SchemaBuildingMap; + + fields?: SchemaBuildingMap; + + values?: { + // Reservoir type. + pushSome: (value: any) => void; + }; + } +} + +type SemanticTypeFunction = ((value: string, path?: string) => boolean); +type SemanticTypeMap = { + [typeName: string]: SemanticTypeFunction | boolean; +}; + +export type SchemaParseOptions = { + semanticTypes?: boolean | SemanticTypeMap; + storeValues?: boolean; +}; + +export type RootSchema = { + fields: SchemaBuildingMap; + count: number; +} + +/** +* Extracts a Javascript string from a BSON type to compute unique values. +*/ +function extractStringValueFromBSON(value: any): string { + if (value && value._bsontype) { + if (['Decimal128', 'Long'].includes(value._bsontype)) { + return value.toString(); + } + if (['Double', 'Int32'].includes(value._bsontype)) { + return String(value.value); + } + } + if (typeof value === 'string') { + return value; + } + return String(value); +} + +function fieldComparator(a: SchemaField, b: SchemaField) { + // Make sure _id is always at top, even in presence of uppercase fields. + const aName = a.name; + const bName = b.name; + if (aName === '_id') { + return -1; + } + if (bName === '_id') { + return 1; + } + // Otherwise sort case-insensitively. + return aName.toLowerCase() < bName.toLowerCase() ? -1 : 1; +} + +/** + * Final pass through the result to add missing information: + * - compute `probability`, `unique`, `has_duplicates` and + * `average_length` fields + * - add `Undefined` pseudo-types + * - collapse `type` arrays to single string if length 1 + * - turns fields and types objects into arrays to conform with original + * schema parser + * This mutates the passed in schema. + */ +function finalizeSchema(schema: any, parent?: any, tag?: 'fields' | 'types'): void { + if (schema === undefined) { + return; + } + + if (tag === undefined) { + // Recursively finalize fields. + // debug('recursively calling schema.fields'); + finalizeSchema(schema.fields, schema, 'fields'); + } + + if (tag === 'fields') { + Object.values(schema).forEach((field: any) => { + // Create `Undefined` pseudo-type. + const missing = parent.count - field.count; + if (missing > 0) { + field.types.Undefined = { + name: 'Undefined', + type: 'Undefined', + path: field.path, + count: missing + }; + } + field.total_count = Object.values(field.types) + .map((v: any) => v.count) + .reduce((p, c) => p + c, 0); + + // Recursively finalize types. + finalizeSchema(field.types, field, 'types'); + field.type = field.types.map((v: SchemaField) => v.name); + if (field.type.length === 1) { + field.type = field.type[0]; + } + + // A field has duplicates when any of its types have duplicates. + field.has_duplicates = !!field.types.find((v: SchemaField) => v.has_duplicates); + + // Compute probability. + field.probability = field.count / parent.count; + }); + + // turn object into array + parent.fields = Object.values(parent.fields as SchemaField[]).sort(fieldComparator); + } + + if (tag === 'types') { + Object.values(schema).forEach((type: any) => { + type.total_count = (type.lengths || []).reduce((p: number, c: number) => p + c || 0, 0); + + // debug('recursively calling schema.fields'); + finalizeSchema(type.fields, type, 'fields'); + + // debug('recursively calling schema.types'); + finalizeSchema(type.types, type, 'types'); + + // compute `probability` for each type + type.probability = type.count / (parent.total_count || parent.count); + + // compute `unique` and `has_duplicates` for each type + if (type.name === 'Null' || type.name === 'Undefined') { + delete type.values; + type.unique = type.count === 0 ? 0 : 1; + type.has_duplicates = type.count > 1; + } else if (type.values) { + type.unique = new Set(type.values.map(extractStringValueFromBSON)).size; + type.has_duplicates = type.unique !== type.values.length; + } + + // compute `average_length` for array types + if (type.lengths) { + type.average_length = type.total_count / type.lengths.length; + } + // recursively finalize fields and types + }); + + parent.types = Object.values(parent.types as SchemaType[]).sort((a, b) => b.probability - a.probability); + } +} + +/** + * Returns the type of value as a string. BSON type aware. Replaces `Object` + * with `Document` to avoid naming conflicts with javascript Objects. + */ +function getBSONType(value: any): string { + let T; + if (value && value._bsontype) { + T = value._bsontype; + } else { + T = Object.prototype.toString.call(value).replace(/^\[object (\w+)\]$/, '$1'); + } + if (T === 'Object') { + T = 'Document'; + } + return T; +} + +/** + * Handles adding the value to the value reservoir. Will also crop + * strings at 10,000 characters. + * + * @param {Object} type the type object from `addToType` + * @param {Any} value the value to be added to `type.values` + */ +function addToValue(type: SchemaBuildingMap[string], value: any) { + if (type.name === 'String') { + // Crop strings at 10k characters, + if (value.length > 10000) { + value = value.charCodeAt(10000 - 1) === value.codePointAt(10000 - 1) + ? value.slice(0, 10000) + : value.slice(0, 10000 - 1); + } + } + type.values!.pushSome(value); +} + +export class SchemaAnalyzer { + finalized: boolean; + semanticTypes: SemanticTypeMap; + rootSchema: RootSchema; + options: SchemaParseOptions; + + constructor(options?: SchemaParseOptions) { + // Set default options. + this.options = { semanticTypes: false, storeValues: true, ...options }; + + this.finalized = false; + + this.semanticTypes = { + ...semanticTypeRegisters + }; + + if (typeof this.options.semanticTypes === 'object') { + // enable existing types that evaluate to true + const enabledTypes = Object.entries(this.options.semanticTypes) + .filter(([, v]) => typeof v === 'boolean' && v) + .map(([k]) => k.toLowerCase()); + + this.semanticTypes = { + ...Object.entries(this.semanticTypes) + .filter(([k]) => enabledTypes.includes(k.toLowerCase())) + .reduce((p, [k, v]) => ({ ...p, [k]: v }), {}) + }; + + Object.entries(this.options.semanticTypes) + .filter(([, v]) => typeof v === 'function') + .forEach(([k, v]) => { + this.semanticTypes[k] = v; + }); + } + + this.rootSchema = { + fields: {}, + count: 0 + }; + } + + getSemanticType(value: any, path: string) { + // Pass value to semantic type detectors, return first match or undefined. + + const returnValue = Object.entries(this.semanticTypes) + .filter(([, v]) => { + return (v as SemanticTypeFunction)(value, path); + }) + .map(([k]) => k)[0]; + + return returnValue; + } + + /** + * Takes a field value, determines the correct type, handles recursion into + * nested arrays and documents, and passes the value down to `addToValue`. + * + * @param {String} path field path in dot notation + * @param {Any} value value of the field + * @param {Object} schema the updated schema object + */ + + addToType(path: string, value: any, schema: SchemaBuildingMap) { + const bsonType = getBSONType(value); + // If semantic type detection is enabled, the type is the semantic type + // or the original bson type if no semantic type was detected. If disabled, + // it is always the bson type. + const typeName = (this.options.semanticTypes) ? this.getSemanticType(value, path) || bsonType : bsonType; + const type = schema[typeName] = _.get(schema, typeName, { + name: typeName, + bsonType: bsonType, + path: path, + count: 0 + } as SchemaBuildingMap[string]); + type.count++; + + // Recurse into arrays by calling `addToType` for each element. + if (typeName === 'Array') { + type.types = type.types ?? {}; + type.lengths = type.lengths || []; + type.lengths.push(value.length); + value.forEach((v: SchemaBuildingMap) => this.addToType(path, v, type.types!)); + + // Recurse into nested documents by calling `addToField` for all sub-fields. + } else if (typeName === 'Document') { + type.fields = _.get(type, 'fields', {}); + Object.entries(value).forEach(([k, v]) => this.addToField(`${path}.${k}`, v, type.fields!)); + + // If the `storeValues` option is enabled, store some example values. + } else if (this.options.storeValues) { + const defaultValue = bsonType === 'String' + ? Reservoir(100) : Reservoir(10000); + type.values = type.values || defaultValue; + + addToValue(type, value); + } + } + + /** + * Handles a field from a document. Passes the value to `addToType`. + * + * @param {String} path field path in dot notation + * @param {Any} value value of the field + * @param {Object} schema the updated schema object + */ + addToField(path: string, value: any, schema: SchemaBuildingMap) { + const pathSplitOnDot = path.split('.'); + const defaults = { + [path]: { + name: pathSplitOnDot[pathSplitOnDot.length - 1], + path: path, + count: 0, + types: {} + } + }; + _.defaultsDeep(schema, defaults); + const field = schema[path]; + + field.count++; + // debug('added to field', field); + this.addToType(path, value, field.types!); + } + + analyzeDoc(doc: Document) { + for (const key of Object.keys(doc)) { + this.addToField(key, doc[key], this.rootSchema.fields); + } + this.rootSchema.count += 1; + } + + getResult(): RootSchema { + if (!this.finalized) { + finalizeSchema(this.rootSchema as SchemaBuildingMap[string]); + this.finalized = true; + } + + return this.rootSchema; + } +} diff --git a/src/stats.ts b/src/stats.ts index a52fecf..a533ebb 100644 --- a/src/stats.ts +++ b/src/stats.ts @@ -1,4 +1,4 @@ -import type { Schema, ArraySchemaType, SchemaField, SchemaType } from './stream'; +import type { Schema, ArraySchemaType, SchemaField, SchemaType } from './schema-analyzer'; function widthRecursive(schema?: Schema) { let width = 0; diff --git a/src/stream.ts b/src/stream.ts index 016a2e1..572a63d 100644 --- a/src/stream.ts +++ b/src/stream.ts @@ -1,415 +1,35 @@ -import Reservoir from 'reservoir'; import { Duplex } from 'stream'; -import _ from 'lodash'; + import type { - Document, - ObjectId, - MinKey, - MaxKey, - Long, - Double, - Int32, - Decimal128, - Binary, - BSONRegExp, - Code, - BSONSymbol, - Timestamp + Document } from 'bson'; -import semanticTypeRegisters from './semantic-types'; - -type BaseSchemaType = { - path: string; - count: number; - probability: number; - has_duplicates: boolean; - unique: number; -} - -type ConstantSchemaType = BaseSchemaType & { - name: 'Null' | 'Undefined'; -} - -type TypeCastMap = { - Array: unknown[]; - Binary: Binary; - Boolean: boolean; - Code: Code; - Date: Date; - Decimal128: Decimal128; - Double: Double; - Int32: Int32; - Int64: Long; - MaxKey: MaxKey; - MinKey: MinKey; - Null: null; - Object: Record; - ObjectId: ObjectId; - BSONRegExp: BSONRegExp; - String: string; - BSONSymbol: BSONSymbol; - Timestamp: Timestamp; - Undefined: undefined; -}; - -type TypeCastTypes = keyof TypeCastMap; -type BSONValue = TypeCastMap[TypeCastTypes]; - -export type PrimitiveSchemaType = BaseSchemaType & { - name: 'String' | 'Number' | 'Int32' | 'Boolean' | 'Decimal128' | 'Long' | 'ObjectId' | 'Date' | 'RegExp' | 'Symbol' | 'MaxKey' | 'MinKey' | 'Binary' | 'Code' | 'Timestamp' | 'DBRef'; - values: BSONValue[]; -} - -export type ArraySchemaType = BaseSchemaType & { - name: 'Array'; - lengths: number[]; - average_length: number; - total_count: number; - // eslint-disable-next-line no-use-before-define - types: SchemaType[]; -} - -export type DocumentSchemaType = BaseSchemaType & { - name: 'Document'; - // eslint-disable-next-line no-use-before-define - fields: SchemaField[]; -} - -export type SchemaType = ConstantSchemaType | PrimitiveSchemaType | ArraySchemaType | DocumentSchemaType; - -export type SchemaField = { - name: string; - count: number; - path: string; - type: string | string[]; - probability: number; - has_duplicates: boolean; - types: SchemaType[]; -}; +import { SchemaAnalyzer } from './schema-analyzer'; +import type { SchemaParseOptions } from './schema-analyzer'; -export type Schema = { - count: number; - fields: SchemaField[] -} - -// Used when building the end Schema. -type SchemaBuildingMap = { - [typeName: string]: { - name: string; - path: string; - count: number; - lengths?: number[]; - average_length?: number; - total_count?: number; - types?: SchemaBuildingMap; - - fields?: SchemaBuildingMap; - - values?: { - // Reservoir type. - pushSome: (value: any) => void; - }; +export class ParseStream extends Duplex { + analyzer: SchemaAnalyzer; + constructor(options?: SchemaParseOptions) { + super({ objectMode: true }); + this.analyzer = new SchemaAnalyzer(options); } -} -/** -* Extracts a Javascript string from a BSON type to compute unique values. -*/ -function extractStringValueFromBSON(value: any): string { - if (value && value._bsontype) { - if (['Decimal128', 'Long'].includes(value._bsontype)) { - return value.toString(); - } - if (['Double', 'Int32'].includes(value._bsontype)) { - return String(value.value); - } - } - if (typeof value === 'string') { - return value; + _write(obj: Document, enc: unknown, cb: () => void) { + this.analyzer.analyzeDoc(obj); + this.emit('progress', obj); + cb(); } - return String(value); -} -function fieldComparator(a: SchemaField, b: SchemaField) { - // Make sure _id is always at top, even in presence of uppercase fields. - const aName = a.name; - const bName = b.name; - if (aName === '_id') { - return -1; - } - if (bName === '_id') { - return 1; - } - // Otherwise sort case-insensitively. - return aName.toLowerCase() < bName.toLowerCase() ? -1 : 1; -} - -/** - * Final pass through the result to add missing information: - * - compute `probability`, `unique`, `has_duplicates` and - * `average_length` fields - * - add `Undefined` pseudo-types - * - collapse `type` arrays to single string if length 1 - * - turns fields and types objects into arrays to conform with original - * schema parser - * This mutates the passed in schema. - */ -function finalizeSchema(schema: any, parent?: any, tag?: 'fields' | 'types'): void { - if (schema === undefined) { - return; - } + _read() {} - if (tag === undefined) { - // Recursively finalize fields. - // debug('recursively calling schema.fields'); - finalizeSchema(schema.fields, schema, 'fields'); - } - if (tag === 'fields') { - Object.values(schema).forEach((field: any) => { - // Create `Undefined` pseudo-type. - const missing = parent.count - field.count; - if (missing > 0) { - field.types.Undefined = { - name: 'Undefined', - type: 'Undefined', - path: field.path, - count: missing - }; - } - field.total_count = Object.values(field.types) - .map((v: any) => v.count) - .reduce((p, c) => p + c, 0); - - // Recursively finalize types. - finalizeSchema(field.types, field, 'types'); - field.type = field.types.map((v: SchemaField) => v.name); - if (field.type.length === 1) { - field.type = field.type[0]; - } - // A field has duplicates when any of its types have duplicates. - field.has_duplicates = !!field.types.find((v: SchemaField) => v.has_duplicates); - // Compute probability. - field.probability = field.count / parent.count; - }); - // turn object into array - parent.fields = Object.values(parent.fields as SchemaField[]).sort(fieldComparator); - } - if (tag === 'types') { - Object.values(schema).forEach((type: any) => { - type.total_count = (type.lengths || []).reduce((p: number, c: number) => p + c || 0, 0); - // debug('recursively calling schema.fields'); - finalizeSchema(type.fields, type, 'fields'); - // debug('recursively calling schema.types'); - finalizeSchema(type.types, type, 'types'); - // compute `probability` for each type - type.probability = type.count / (parent.total_count || parent.count); - // compute `unique` and `has_duplicates` for each type - if (type.name === 'Null' || type.name === 'Undefined') { - delete type.values; - type.unique = type.count === 0 ? 0 : 1; - type.has_duplicates = type.count > 1; - } else if (type.values) { - type.unique = new Set(type.values.map(extractStringValueFromBSON)).size; - type.has_duplicates = type.unique !== type.values.length; - } - // compute `average_length` for array types - if (type.lengths) { - type.average_length = type.total_count / type.lengths.length; - } - // recursively finalize fields and types - }); - parent.types = Object.values(parent.types as SchemaType[]).sort((a, b) => b.probability - a.probability); + _final(cb: () => void) { + this.push(this.analyzer.getResult()); + this.push(null); + cb(); } } -type SemanticTypeFunction = ((value: string, path?: string) => boolean); -type SemanticTypeMap = { - [typeName: string]: SemanticTypeFunction | boolean; -}; - -export type SchemaParseOptions = { - semanticTypes?: boolean | SemanticTypeMap; - storeValues?: boolean; -}; - -/** - * Main entry point for schema parsing. - */ -function parse(options?: SchemaParseOptions) { - // Set default options. - options = { semanticTypes: false, storeValues: true, ...options }; - - let semanticTypes: SemanticTypeMap = { - ...semanticTypeRegisters - }; - - if (typeof options.semanticTypes === 'object') { - // enable existing types that evaluate to true - const enabledTypes = Object.entries(options.semanticTypes) - .filter(([, v]) => typeof v === 'boolean' && v) - .map(([k]) => k.toLowerCase()); - - semanticTypes = { - ...Object.entries(semanticTypes) - .filter(([k]) => enabledTypes.includes(k.toLowerCase())) - .reduce((p, [k, v]) => ({ ...p, [k]: v }), {}) - }; - - Object.entries(options.semanticTypes) - .filter(([, v]) => typeof v === 'function') - .forEach(([k, v]) => { semanticTypes[k] = v; }); - } - - const rootSchema: { - fields: SchemaBuildingMap; - count: number; - } = { - fields: {}, - count: 0 - }; - - let finalized = false; - - /** - * Returns the type of value as a string. BSON type aware. Replaces `Object` - * with `Document` to avoid naming conflicts with javascript Objects. - */ - function getBSONType(value: any): string { - let T; - if (value && value._bsontype) { - T = value._bsontype; - } else { - T = Object.prototype.toString.call(value).replace(/^\[object (\w+)\]$/, '$1'); - } - if (T === 'Object') { - T = 'Document'; - } - return T; - } - - function getSemanticType(value: any, path: string) { - // Pass value to semantic type detectors, return first match or undefined. - - const returnValue = Object.entries(semanticTypes) - .filter(([, v]) => { - return (v as SemanticTypeFunction)(value, path); - }) - .map(([k]) => k)[0]; - return returnValue; - } - - /** - * Handles adding the value to the value reservoir. Will also crop - * strings at 10,000 characters. - * - * @param {Object} type the type object from `addToType` - * @param {Any} value the value to be added to `type.values` - */ - function addToValue(type: SchemaBuildingMap[string], value: any) { - if (type.name === 'String') { - // Crop strings at 10k characters, - if (value.length > 10000) { - value = value.charCodeAt(10000 - 1) === value.codePointAt(10000 - 1) - ? value.slice(0, 10000) - : value.slice(0, 10000 - 1); - } - } - type.values!.pushSome(value); - } - - /** - * Takes a field value, determines the correct type, handles recursion into - * nested arrays and documents, and passes the value down to `addToValue`. - * - * @param {String} path field path in dot notation - * @param {Any} value value of the field - * @param {Object} schema the updated schema object - */ - - function addToType(path: string, value: any, schema: SchemaBuildingMap) { - const bsonType = getBSONType(value); - // If semantic type detection is enabled, the type is the semantic type - // or the original bson type if no semantic type was detected. If disabled, - // it is always the bson type. - const typeName = (options?.semanticTypes) ? getSemanticType(value, path) || bsonType : bsonType; - const type = schema[typeName] = _.get(schema, typeName, { - name: typeName, - bsonType: bsonType, - path: path, - count: 0 - } as SchemaBuildingMap[string]); - type.count++; - // Recurse into arrays by calling `addToType` for each element. - if (typeName === 'Array') { - type.types = type.types ?? {}; - type.lengths = type.lengths || []; - type.lengths.push(value.length); - value.forEach((v: SchemaBuildingMap) => addToType(path, v, type.types!)); - - // Recurse into nested documents by calling `addToField` for all sub-fields. - } else if (typeName === 'Document') { - type.fields = _.get(type, 'fields', {}); - Object.entries(value).forEach(([k, v]) => addToField(`${path}.${k}`, v, type.fields!)); - - // If the `storeValues` option is enabled, store some example values. - } else if (options?.storeValues) { - const defaultValue = bsonType === 'String' - ? Reservoir(100) : Reservoir(10000); - type.values = type.values || defaultValue; - - addToValue(type, value); - } - } - - /** - * Handles a field from a document. Passes the value to `addToType`. - * - * @param {String} path field path in dot notation - * @param {Any} value value of the field - * @param {Object} schema the updated schema object - */ - function addToField(path: string, value: any, schema: SchemaBuildingMap) { - const pathSplitOnDot = path.split('.'); - const defaults = { - [path]: { - name: pathSplitOnDot[pathSplitOnDot.length - 1], - path: path, - count: 0, - types: {} - } - }; - _.defaultsDeep(schema, defaults); - const field = schema[path]; - - field.count++; - // debug('added to field', field); - addToType(path, value, field.types!); - } - - function cleanup() { - if (!finalized) { - finalizeSchema(rootSchema as SchemaBuildingMap[string]); - finalized = true; - } - } - - return new Duplex({ - objectMode: true, - write(obj: Document, enc: unknown, cb: () => void) { - for (const key of Object.keys(obj)) { addToField(key, obj[key], rootSchema.fields); } - rootSchema.count += 1; - this.emit('progress', obj); - cb(); - }, - read() {}, - final(cb: () => void) { - cleanup(); - this.push(rootSchema); - this.push(null); - cb(); - } - }); +// for backwards compatibility +export default function makeParseStream(options?: SchemaParseOptions) { + return new ParseStream(options); } - -export default parse; diff --git a/test/array-object-types.test.ts b/test/array-object-types.test.ts index ce74579..bfd9e14 100644 --- a/test/array-object-types.test.ts +++ b/test/array-object-types.test.ts @@ -1,6 +1,6 @@ import getSchema from '../src'; import assert from 'assert'; -import type { ArraySchemaType, Schema, SchemaField, SchemaType } from '../src/stream'; +import type { ArraySchemaType, Schema, SchemaField, SchemaType } from '../src/schema-analyzer'; describe('arrays and objects as type (INT-203 restructuring)', function() { const docs = [ diff --git a/test/basic-embedded-array.test.ts b/test/basic-embedded-array.test.ts index eefd158..f8a7ede 100644 --- a/test/basic-embedded-array.test.ts +++ b/test/basic-embedded-array.test.ts @@ -2,7 +2,7 @@ import assert from 'assert'; import BSON from 'bson'; import getSchema from '../src'; -import type { ArraySchemaType } from '../src/stream'; +import type { ArraySchemaType } from '../src/schema-analyzer'; describe('basic embedded array', function() { let followingIds: ArraySchemaType; diff --git a/test/basic-embedded-documents.test.ts b/test/basic-embedded-documents.test.ts index f24062e..0f6d0f5 100644 --- a/test/basic-embedded-documents.test.ts +++ b/test/basic-embedded-documents.test.ts @@ -2,7 +2,7 @@ import assert from 'assert'; import BSON from 'bson'; import getSchema from '../src'; -import type { Schema, DocumentSchemaType, SchemaField } from '../src/stream'; +import type { Schema, DocumentSchemaType, SchemaField } from '../src/schema-analyzer'; describe('basic embedded documents', function() { const docs = [ diff --git a/test/basic-probability.test.ts b/test/basic-probability.test.ts index 9e9e426..e92cbcd 100644 --- a/test/basic-probability.test.ts +++ b/test/basic-probability.test.ts @@ -1,7 +1,7 @@ import assert from 'assert'; import getSchema from '../src'; -import type { Schema } from '../src/stream'; +import type { Schema } from '../src/schema-analyzer'; describe('simple probability', function() { const docs = [ diff --git a/test/basic-unique.test.ts b/test/basic-unique.test.ts index e4e8a17..34393aa 100644 --- a/test/basic-unique.test.ts +++ b/test/basic-unique.test.ts @@ -2,7 +2,7 @@ import assert from 'assert'; import bson from 'bson'; import getSchema from '../src'; -import type { Schema, PrimitiveSchemaType } from '../src/stream'; +import type { Schema, PrimitiveSchemaType } from '../src/schema-analyzer'; describe('has_duplicates', function() { const docs: { diff --git a/test/basic.test.ts b/test/basic.test.ts index e86df00..6f01c5d 100644 --- a/test/basic.test.ts +++ b/test/basic.test.ts @@ -1,7 +1,7 @@ import assert from 'assert'; import BSON from 'bson'; -import type { Schema } from '../src/stream'; +import type { Schema } from '../src/schema-analyzer'; import getSchema from '../src'; describe('using only basic fields', function() { diff --git a/test/mixed-type-evolving-schema.test.ts b/test/mixed-type-evolving-schema.test.ts index 6a56c6d..3dce102 100644 --- a/test/mixed-type-evolving-schema.test.ts +++ b/test/mixed-type-evolving-schema.test.ts @@ -1,7 +1,7 @@ import assert from 'assert'; import BSON from 'bson'; -import type { SchemaField } from '../src/stream'; +import type { SchemaField } from '../src/schema-analyzer'; import getSchema from '../src'; describe('evolving schema', function() { diff --git a/test/mixed-type-nested.test.ts b/test/mixed-type-nested.test.ts index 66abdc5..e52551c 100644 --- a/test/mixed-type-nested.test.ts +++ b/test/mixed-type-nested.test.ts @@ -1,7 +1,7 @@ import assert from 'assert'; import getSchema from '../src'; -import type { DocumentSchemaType, Schema, SchemaField } from '../src/stream'; +import type { DocumentSchemaType, Schema, SchemaField } from '../src/schema-analyzer'; describe('mixed types nested', function() { const docs = [ diff --git a/test/mixed-type-order.test.ts b/test/mixed-type-order.test.ts index 63b58cd..b6e3eb8 100644 --- a/test/mixed-type-order.test.ts +++ b/test/mixed-type-order.test.ts @@ -1,6 +1,6 @@ import assert from 'assert'; -import type { SchemaField } from '../src/stream'; +import type { SchemaField } from '../src/schema-analyzer'; import getSchema from '../src'; describe('mixed type order', function() { diff --git a/test/mixed-type-probability.test.ts b/test/mixed-type-probability.test.ts index c105f80..208ee47 100644 --- a/test/mixed-type-probability.test.ts +++ b/test/mixed-type-probability.test.ts @@ -1,7 +1,7 @@ import assert from 'assert'; import getSchema from '../src'; -import type { Schema, SchemaField } from '../src/stream'; +import type { Schema, SchemaField } from '../src/schema-analyzer'; describe('mixed type probability', function() { const docs = [ diff --git a/test/nested-document-path.test.ts b/test/nested-document-path.test.ts index 429e31e..faef7d4 100644 --- a/test/nested-document-path.test.ts +++ b/test/nested-document-path.test.ts @@ -1,7 +1,7 @@ import assert from 'assert'; import getSchema from '../src'; -import type { Schema, DocumentSchemaType } from '../src/stream'; +import type { Schema, DocumentSchemaType } from '../src/schema-analyzer'; describe('nested document path', function() { let schema: Schema; diff --git a/test/promise.test.ts b/test/promise.test.ts index 0b7d3c6..44c27f4 100644 --- a/test/promise.test.ts +++ b/test/promise.test.ts @@ -1,6 +1,6 @@ import assert from 'assert'; -import type { Schema } from '../src/stream'; +import type { Schema } from '../src/schema-analyzer'; import getSchema from '../src'; describe('getSchema should return promise', function() { diff --git a/test/regression-strings-have-same-probability.test.ts b/test/regression-strings-have-same-probability.test.ts index 714efe2..de96d39 100644 --- a/test/regression-strings-have-same-probability.test.ts +++ b/test/regression-strings-have-same-probability.test.ts @@ -1,7 +1,7 @@ import assert from 'assert'; import getSchema from '../src'; -import type { Schema, PrimitiveSchemaType } from '../src/stream'; +import type { Schema, PrimitiveSchemaType } from '../src/schema-analyzer'; describe('regression', function() { describe('strings have same probability', function() { diff --git a/test/semantic-types.test.ts b/test/semantic-types.test.ts index 4734f0f..8195f45 100644 --- a/test/semantic-types.test.ts +++ b/test/semantic-types.test.ts @@ -2,7 +2,7 @@ import assert from 'assert'; import BSON from 'bson'; import getSchema from '../src'; -import type { Schema, PrimitiveSchemaType } from '../src/stream'; +import type { Schema, PrimitiveSchemaType } from '../src/schema-analyzer'; describe('options', function() { const docs = [ diff --git a/test/stats.test.ts b/test/stats.test.ts index 2adfa07..7493d82 100644 --- a/test/stats.test.ts +++ b/test/stats.test.ts @@ -2,7 +2,7 @@ import assert from 'assert'; import getSchema from '../src'; import stats from '../src/stats'; -import type { Schema } from '../src/stream'; +import type { Schema } from '../src/schema-analyzer'; describe('schema statistics', function() { describe('empty doc', function() { diff --git a/test/stream.test.ts b/test/stream.test.ts index 9276de7..12e1082 100644 --- a/test/stream.test.ts +++ b/test/stream.test.ts @@ -2,7 +2,7 @@ import assert from 'assert'; import { Readable } from 'stream'; import nativeParser from '../src/stream'; -import type { Schema } from '../src/stream'; +import type { Schema } from '../src/schema-analyzer'; const fixture = [ {