Skip to content

Commit

Permalink
v1.165.0
Browse files Browse the repository at this point in the history
  • Loading branch information
varovaro committed May 6, 2024
2 parents 6e84aaa + 577102c commit f4597ac
Show file tree
Hide file tree
Showing 116 changed files with 4,377 additions and 2,002 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/ci_e2e_cypress.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
image: docker.elastic.co/elasticsearch/elasticsearch:7.17.6-amd64
ports:
- 9200/tcp
options: -e="discovery.type=single-node" --health-cmd="curl http://localhost:9200/_cluster/health" --health-interval=10s --health-timeout=5s --health-retries=10
options: -e="discovery.type=single-node" --health-cmd="curl http://localhost:9200/_cluster/health" --health-interval=10s --health-timeout=30s --health-retries=10
redis:
image: redis:5.0.14
ports:
Expand Down Expand Up @@ -94,6 +94,7 @@ jobs:
DATABASE_NAME: uwazi_e2e
- name: start Uwazi
env:
NOTIFICATION_DELAY: 50
DBHOST: localhost:27017
ELASTICSEARCH_URL: http://localhost:${{ job.services.elasticsearch.ports[9200] }}
DATABASE_NAME: uwazi_e2e
Expand All @@ -102,7 +103,7 @@ jobs:
run: yarn run-production > output.txt &
- name: wait for uwazi to be ready
run: sleep 5 && wget --waitretry=5 --retry-connrefused -v http://localhost:3000/
timeout-minutes: 2
timeout-minutes: 3
- name: Cypress run
uses: cypress-io/github-action@v6
with:
Expand Down
193 changes: 126 additions & 67 deletions app/api/services/informationextraction/InformationExtraction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,42 +20,82 @@ import templatesModel from 'api/templates/templates';
import request from 'shared/JSONRequest';
import languages from 'shared/languages';
import { EntitySchema } from 'shared/types/entityType';
import { ObjectIdSchema, PropertySchema } from 'shared/types/commonTypes';
import { ExtractedMetadataSchema, ObjectIdSchema, PropertySchema } from 'shared/types/commonTypes';
import { ModelStatus } from 'shared/types/IXModelSchema';
import { IXSuggestionType } from 'shared/types/suggestionType';
import { FileType } from 'shared/types/fileType';
import {
FileWithAggregation,
getFilesForTraining,
getFilesForSuggestions,
propertyTypeIsSelectOrMultiSelect,
} from 'api/services/informationextraction/getFiles';
import { Suggestions } from 'api/suggestions/suggestions';
import { IXExtractorType } from 'shared/types/extractorType';
import { IXModelType } from 'shared/types/IXModelType';
import { stringToTypeOfProperty } from 'shared/stringToTypeOfProperty';
import { ParagraphSchema } from 'shared/types/segmentationType';
import ixmodels from './ixmodels';
import { IXModelsModel } from './IXModelsModel';
import { Extractors } from './ixextractors';
import {
CommonSuggestion,
RawSuggestion,
TextSelectionSuggestion,
ValuesSelectionSuggestion,
formatSuggestion,
} from './suggestionFormatting';

type RawSuggestion = {
tenant: string;
const defaultTrainingLanguage = 'en';

/** Task names used to parameterize the TaskManager and ResultsMessage generics in this file. */
type TaskTypes = 'suggestions' | 'create_model';

/** Parameters of a task; `id` is carried in string form. */
interface TaskParameters {
id: string;
}

/** Result messages carry the same parameters as the task they answer. */
type ResultParameters = TaskParameters;

/**
 * Result parameters after `processResults` has converted the string `id`
 * into an `ObjectId` (it is later used to look up the extractor by `_id`).
 */
interface InternalResultParameters {
id: ObjectId;
}

/** Results message as accepted by `processResults` (string `id`). */
type IXResultsMessage = ResultsMessage<TaskTypes, ResultParameters>;

/** Results message with an `ObjectId` id, used by the methods that run after `processResults`. */
type InternalIXResultsMessage = ResultsMessage<TaskTypes, InternalResultParameters>;

interface CommonMaterialsData {
xml_file_name: string;
text: string;
segment_text: string;
segments_boxes: {
top: number;
left: number;
width: number;
height: number;
page_number: number;
}[];
};
id: string;
tenant: string;
xml_segments_boxes?: ParagraphSchema[];
page_width?: number;
page_height?: number;
}

/**
 * Common materials plus the language of the file.
 * `extendMaterialsWithLabeledData` fills `language_iso` from the file language
 * ('ISO639_1' lookup), falling back to the default training language.
 */
interface LabeledMaterialsData extends CommonMaterialsData {
language_iso: string;
}

/**
 * Labeled materials for properties that are not select/multiselect:
 * the label text plus the boxes of the labeled selection.
 */
interface TextSelectionMaterialsData extends LabeledMaterialsData {
// Property value, or the text of the labeled selection when the value is falsy.
label_text: FileWithAggregation['propertyValue'];
// Selection rectangles with their `page` field renamed to `page_number`.
label_segments_boxes:
| (Omit<ParagraphSchema, 'page_number'> & { page_number?: string })[]
| undefined;
}

/** Labeled materials for select/multiselect properties. */
interface ValuesSelectionMaterialsData extends LabeledMaterialsData {
// Built from the property value entries: `value` becomes `id`, `label` is kept.
values: { id: string; label: string }[];
}

/** Any of the payload shapes that `sendMaterials` may post to the service. */
type MaterialsData =
| CommonMaterialsData
| TextSelectionMaterialsData
| ValuesSelectionMaterialsData;

class InformationExtraction {
static SERVICE_NAME = 'information_extraction';

public taskManager: TaskManager;
public taskManager: TaskManager<TaskTypes, TaskParameters, ResultParameters>;

static mock: any;

Expand All @@ -70,7 +110,7 @@ class InformationExtraction {
this.taskManager.subscribeToResults();
}

requestResults = async (message: ResultsMessage) => {
requestResults = async (message: InternalIXResultsMessage) => {
const response = await request.get(message.data_url);

return JSON.parse(response.json);
Expand All @@ -88,6 +128,43 @@ class InformationExtraction {
return request.uploadFile(url, xmlName, fileContent);
};

/**
 * Builds the labeled-data payload for a file on top of the common materials.
 *
 * The file language (as ISO 639-1, or the default training language when the
 * lookup yields nothing) is always added. Then, depending on the property type:
 * - select / multiselect: adds `values`, mapping each entry of the property
 *   value to `{ id, label }`;
 * - any other type with labeled data: adds `label_text` and
 *   `label_segments_boxes` (selection rectangles with `page` renamed to
 *   `page_number`);
 * - any other type without labeled data: nothing else is added.
 *
 * @param propertyLabeledData - labeled metadata of the property, if any.
 * @param propertyValue - current value of the property.
 * @param propertyType - type of the property.
 * @param file - file the materials belong to; only its language is read here.
 * @param _data - common materials to extend; it is copied, not mutated.
 * @returns the extended materials payload.
 * @throws Error when the property is a select/multiselect and its value is not an array.
 */
extendMaterialsWithLabeledData = (
  propertyLabeledData: ExtractedMetadataSchema | undefined,
  propertyValue: FileWithAggregation['propertyValue'],
  propertyType: FileWithAggregation['propertyType'],
  file: FileWithAggregation,
  _data: CommonMaterialsData
): MaterialsData => {
  const withLanguage: MaterialsData = {
    ..._data,
    language_iso: languages.get(file.language!, 'ISO639_1') || defaultTrainingLanguage,
  };

  if (propertyTypeIsSelectOrMultiSelect(propertyType)) {
    // Select-like properties are labeled by their values, not by a text selection.
    if (!Array.isArray(propertyValue)) {
      throw new Error('Property value should be an array');
    }
    return {
      ...withLanguage,
      values: propertyValue.map(({ value, label }) => ({ id: value, label })),
    };
  }

  if (!propertyLabeledData) {
    return withLanguage;
  }

  const toSegmentBox = (
    selectionRectangle: NonNullable<
      NonNullable<ExtractedMetadataSchema['selection']>['selectionRectangles']
    >[number]
  ) => {
    // The payload uses `page_number` where the selection rectangle uses `page`.
    const { page, ...rectangle } = selectionRectangle;
    return { ...rectangle, page_number: page };
  };

  return {
    ...withLanguage,
    label_text: propertyValue || propertyLabeledData.selection?.text,
    label_segments_boxes: propertyLabeledData.selection?.selectionRectangles?.map(toSegmentBox),
  };
};

sendMaterials = async (
files: FileWithAggregation[],
extractor: IXExtractorType,
Expand All @@ -102,14 +179,17 @@ class InformationExtraction {
const propertyLabeledData = file.extractedMetadata?.find(
labeledData => labeledData.name === extractor.property
);
const { propertyValue, propertyType } = file;

if (!xmlExists || (type === 'labeled_data' && !propertyLabeledData)) {
return;
}
const missingData = propertyTypeIsSelectOrMultiSelect(propertyType)
? !propertyValue
: type === 'labeled_data' && !propertyLabeledData;

if (!xmlExists || missingData) return;

await InformationExtraction.sendXmlToService(serviceUrl, xmlName, extractor._id, type);

let data: any = {
let data: MaterialsData = {
xml_file_name: xmlName,
id: extractor._id.toString(),
tenant: tenants.current().name,
Expand All @@ -118,17 +198,14 @@ class InformationExtraction {
page_height: file.segmentation.segmentation?.page_height,
};

if (type === 'labeled_data' && propertyLabeledData) {
const defaultTrainingLanguage = 'en';
data = {
...data,
language_iso: languages.get(file.language!, 'ISO639_1') || defaultTrainingLanguage,
label_text: file.propertyValue || propertyLabeledData.selection?.text,
label_segments_boxes: propertyLabeledData.selection?.selectionRectangles?.map(r => {
const { page, ...selection } = r;
return { ...selection, page_number: page };
}),
};
if (type === 'labeled_data' && !missingData) {
data = this.extendMaterialsWithLabeledData(
propertyLabeledData,
propertyValue,
propertyType,
file,
data
);
}
await request.post(urljoin(serviceUrl, type), data);
if (type === 'prediction_data') {
Expand Down Expand Up @@ -171,7 +248,7 @@ class InformationExtraction {
return this._getEntityFromFile(file);
};

saveSuggestions = async (message: ResultsMessage) => {
saveSuggestions = async (message: InternalIXResultsMessage) => {
const templates = await templatesModel.get();
const rawSuggestions: RawSuggestion[] = await this.requestResults(message);
const [extractor] = await Extractors.get({ _id: message.params?.id });
Expand All @@ -197,46 +274,20 @@ class InformationExtraction {
fileId: segmentation.fileID,
});

let status: 'ready' | 'failed' = 'ready';
let error = '';

const allProps: PropertySchema[] = _.flatMap(
templates,
template => template.properties || []
);
const property = allProps.find(p => p.name === extractor.property);

const suggestedValue = stringToTypeOfProperty(
rawSuggestion.text,
property?.type,
currentSuggestion?.language || entity.language
const suggestion = await formatSuggestion(
property,
rawSuggestion,
currentSuggestion,
entity,
message
);

if (suggestedValue === null) {
status = 'failed';
error = 'Invalid value for property type';
}

if (!message.success) {
status = 'failed';
error = message.error_message ? message.error_message : 'Unknown error';
}

const suggestion: IXSuggestionType = {
...currentSuggestion,
suggestedValue,
...(property?.type === 'date' ? { suggestedText: rawSuggestion.text } : {}),
segment: rawSuggestion.segment_text,
status,
error,
date: new Date().getTime(),
selectionRectangles: rawSuggestion.segments_boxes.map((box: any) => {
const rect = { ...box, page: box.page_number.toString() };
delete rect.page_number;
return rect;
}),
};

return Suggestions.save(suggestion);
})
);
Expand Down Expand Up @@ -398,9 +449,9 @@ class InformationExtraction {
});
};

processResults = async (_message: ResultsMessage): Promise<void> => {
processResults = async (_message: IXResultsMessage): Promise<void> => {
await tenants.run(async () => {
const message = {
const message: InternalIXResultsMessage = {
..._message,
params: { ..._message.params, id: new ObjectId(_message.params!.id) },
};
Expand Down Expand Up @@ -439,7 +490,7 @@ class InformationExtraction {
};
};

updateSuggestionStatus = async (message: ResultsMessage, currentModel: IXModelType) => {
updateSuggestionStatus = async (message: InternalIXResultsMessage, currentModel: IXModelType) => {
const suggestionsStatus = await this.getSuggestionsStatus(
message.params!.id,
currentModel.creationDate
Expand All @@ -456,3 +507,11 @@ class InformationExtraction {
}

export { InformationExtraction };
export type {
IXResultsMessage,
InternalIXResultsMessage,
CommonSuggestion,
TextSelectionSuggestion,
ValuesSelectionSuggestion,
RawSuggestion,
};
Loading

0 comments on commit f4597ac

Please sign in to comment.