diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java index ef4117e93c..0d646ed93d 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Figure.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Figure.java @@ -323,8 +323,12 @@ public String getTeiId() { return "fig_" + this.id; } + public boolean isCompleteForTEI() { + return (StringUtils.isNotBlank(header) || StringUtils.isNotBlank(caption) || CollectionUtils.isNotEmpty(graphicObjects)); + } + public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List markerTypes) { - if (StringUtils.isEmpty(header) && StringUtils.isEmpty(caption) && CollectionUtils.isEmpty(graphicObjects)) { + if (!isCompleteForTEI()) { return null; } Element figureElement = XmlBuilderUtils.teiElement("figure"); @@ -568,4 +572,5 @@ public void setLabel(StringBuilder label) { public void setUri(URI uri) { this.uri = uri; } + } diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 14d468418c..1016760284 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -30,7 +30,6 @@ import nu.xom.Attribute; import nu.xom.Element; import nu.xom.Node; -import nu.xom.Text; import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId; @@ -43,6 +42,7 @@ public class Table extends Figure { private List contentTokens = new ArrayList<>(); private List fullDescriptionTokens = new ArrayList<>(); + private boolean goodTable = true; private StringBuilder note = null; @@ -62,9 +62,13 @@ public Table() { note = new StringBuilder(); } + public boolean isCompleteForTEI() { + return (StringUtils.isNotEmpty(header) && StringUtils.isNotEmpty(caption)); + } + @Override public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List markerTypes) { - if (StringUtils.isEmpty(header) && StringUtils.isEmpty(caption)) { + if (!isCompleteForTEI()) { return null; } @@ -104,7 +108,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form addXmlId(desc, "_" + divID); } - if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) { + if (StringUtils.isNotBlank(labeledCaption)) { TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens); List clusters = clusteror.cluster(); for (TaggingTokenCluster cluster : clusters) { @@ -169,7 +173,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form } Element noteNode = null; - if (note != null && note.toString().trim().length()>0) { + if (StringUtils.isNotBlank(note)) { noteNode = XmlBuilderUtils.teiElement("note"); if (config.isGenerateTeiIds()) { @@ -177,7 +181,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form addXmlId(noteNode, "_" + divID); } - if ( (labeledNote != null) && (labeledNote.length() > 0) ) { + if (StringUtils.isNotBlank(labeledNote)) { TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledNote, noteLayoutTokens); List clusters = clusteror.cluster(); for (TaggingTokenCluster cluster : clusters) { @@ -346,9 +350,14 @@ public String getLabeledNote() { return this.labeledNote; } - private boolean validateTable() { + /** Check if the table: + * - has label, header and content + * - header starts with "tab" + * - label can be parsed + */ + public boolean validateTable() { CntManager cnt = Engine.getCntManager(); - if (StringUtils.isEmpty(label) || StringUtils.isEmpty(header) || StringUtils.isEmpty(content)) { + if (StringUtils.isAnyBlank(label, header, content)) { cnt.i(TableRejectionCounters.EMPTY_LABEL_OR_HEADER_OR_CONTENT); return false; } @@ -359,7 +368,8 @@ private boolean validateTable() { cnt.i(TableRejectionCounters.CANNOT_PARSE_LABEL_TO_INT); return false; } - if (!getHeader().toLowerCase().startsWith("table")) { + // tab covers: table, tabelle, tableu, tabella, etc. + if (!StringUtils.startsWithIgnoreCase(getHeader(), "tab")) { cnt.i(TableRejectionCounters.HEADER_NOT_STARTS_WITH_TABLE_WORD); return false; } @@ -423,4 +433,5 @@ public boolean isGoodTable() { public String getTeiId() { return "tab_" + this.id; } + } \ No newline at end of file diff --git a/grobid-core/src/main/java/org/grobid/core/document/Document.java b/grobid-core/src/main/java/org/grobid/core/document/Document.java index d7dc90b08b..3925470a45 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/Document.java +++ b/grobid-core/src/main/java/org/grobid/core/document/Document.java @@ -874,6 +874,7 @@ public static List getConnectedGraphics(Block block, Document doc public void postProcessTables() { for (Table table : tables) { if (!table.firstCheck()) { + table.setGoodTable(false); continue; } @@ -919,7 +920,7 @@ public void postProcessTables() { table.getContentTokens().clear(); table.getContentTokens().addAll(contentResult); - table.secondCheck(); + table.setGoodTable(table.secondCheck()); } } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index b102b20795..c94aca004c 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -11,7 +11,6 @@ import java.nio.charset.StandardCharsets; -import org.apache.lucene.util.CollectionUtil; import org.grobid.core.GrobidModels; import org.grobid.core.data.*; import org.grobid.core.document.Document; @@ -33,21 +32,16 @@ import org.grobid.core.features.FeaturesVectorFulltext; import org.grobid.core.lang.Language; import org.grobid.core.lexicon.Lexicon; -import org.grobid.core.lexicon.Lexicon.OrganizationRecord; import org.grobid.core.layout.*; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; -import org.grobid.core.utilities.LanguageUtilities; -import org.grobid.core.utilities.TextUtilities; -import org.grobid.core.utilities.KeyGen; -import org.grobid.core.utilities.LayoutTokensUtil; -import org.grobid.core.utilities.GrobidProperties; -import org.grobid.core.utilities.Consolidation; +import org.grobid.core.utilities.*; import org.grobid.core.utilities.matching.ReferenceMarkerMatcher; import org.grobid.core.utilities.matching.EntityMatcherException; import org.grobid.core.engines.citations.CalloutAnalyzer; import org.grobid.core.engines.citations.CalloutAnalyzer.MarkerType; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -66,10 +60,13 @@ import java.util.StringTokenizer; import java.util.TreeSet; import java.util.regex.Matcher; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import nu.xom.Element; import static org.apache.commons.lang3.StringUtils.*; +import static org.grobid.core.utilities.LabelUtils.postProcessFullTextLabeledText; public class FullTextParser extends AbstractParser { private static final Logger LOGGER = LoggerFactory.getLogger(FullTextParser.class); @@ -252,23 +249,25 @@ else if (config.getConsolidateCitations() == 2) // full text processing featSeg = getBodyTextFeatured(doc, documentBodyParts); - String resultBody = null; - LayoutTokenization layoutTokenization = null; + String bodyResults = null; + LayoutTokenization bodyLayoutTokens = null; List
figures = null; List tables = null; List equations = null; if (featSeg != null && isNotBlank(featSeg.getLeft())) { - // if featSeg is null, it usually means that no body segment is found in the + // if featSeg is null, it usually means that the fulltext body is not found in the // document segmentation - String bodytext = featSeg.getLeft(); - layoutTokenization = featSeg.getRight(); + String bodyText = featSeg.getLeft(); + bodyLayoutTokens = featSeg.getRight(); //tokenizationsBody = featSeg.getB().getTokenization(); //layoutTokensBody = featSeg.getB().getLayoutTokens(); - resultBody = label(bodytext); + bodyResults = label(bodyText); + //Correct subsequent I-
or I-
+ bodyResults = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(bodyResults); // we apply now the figure and table models based on the fulltext labeled output - figures = processFigures(resultBody, layoutTokenization.getTokenization(), doc); + figures = processFigures(bodyResults, bodyLayoutTokens.getTokenization(), doc); // further parse the caption for(Figure figure : figures) { if (CollectionUtils.isNotEmpty(figure.getCaptionLayoutTokens()) ) { @@ -277,8 +276,46 @@ else if (config.getConsolidateCitations() == 2) figure.setCaptionLayoutTokens(captionProcess.getRight()); } } - - tables = processTables(resultBody, layoutTokenization.getTokenization(), doc); + + long numberFiguresFulltextModel = Arrays.stream(bodyResults.split("\n")) + .filter(r -> r.endsWith("I-" + TaggingLabels.FIGURE_LABEL)) + .count(); + + List
badFigures = figures.stream() + .filter(f -> !f.isCompleteForTEI()) + .collect(Collectors.toList()); + + LOGGER.info("Number of figures badly formatted or incomplete we identified: " + badFigures.size()); + bodyResults = revertResultsForBadItems(badFigures, bodyResults, TaggingLabels.FIGURE_LABEL, + !(figures.size() > numberFiguresFulltextModel)); + + figures = figures.stream() + .filter(f -> !badFigures.contains(f)) + .collect(Collectors.toList()); + + tables = processTables(bodyResults, bodyLayoutTokens.getTokenization(), doc); + + long numberTablesFulltextModel = Arrays.stream(bodyResults.split("\n")) + .filter(r -> r.endsWith("I-" + TaggingLabels.TABLE_LABEL)) + .count(); + + //We deal with tables considered bad by reverting them as , to reduce the risk them to be + // dropped later on. + + //TODO: double check the way the tables are validated + + List
badTables = tables.stream() + .filter(t -> !(t.isCompleteForTEI() && t.validateTable())) + .collect(Collectors.toList()); + + LOGGER.info("Number of tables badly formatted or incomplete we identified: " + badTables.size()); + bodyResults = revertResultsForBadItems(badTables, bodyResults, TaggingLabels.TABLE_LABEL, + !(tables.size() > numberTablesFulltextModel)); + + tables = tables.stream() + .filter(t-> !badTables.contains(t)) + .collect(Collectors.toList()); + // further parse the caption for(Table table : tables) { if ( CollectionUtils.isNotEmpty(table.getCaptionLayoutTokens()) ) { @@ -293,7 +330,7 @@ else if (config.getConsolidateCitations() == 2) } } - equations = processEquations(resultBody, layoutTokenization.getTokenization(), doc); + equations = processEquations(bodyResults, bodyLayoutTokens.getTokenization(), doc); } else { LOGGER.debug("Fulltext model: The featured body is empty"); } @@ -301,30 +338,35 @@ else if (config.getConsolidateCitations() == 2) // possible annexes (view as a piece of full text similar to the body) documentBodyParts = doc.getDocumentPart(SegmentationLabels.ANNEX); featSeg = getBodyTextFeatured(doc, documentBodyParts); - String resultAnnex = null; - List tokenizationsBody2 = null; + String annexResults = null; + List annexTokens = null; if (featSeg != null && isNotEmpty(trim(featSeg.getLeft()))) { - // if featSeg is null, it usually means that no body segment is found in the + // if featSeg is null, it usually means that no annex segment is found in the // document segmentation - String bodytext = featSeg.getLeft(); - tokenizationsBody2 = featSeg.getRight().getTokenization(); - resultAnnex = label(bodytext); - //System.out.println(rese); + String annexFeatures = featSeg.getLeft(); + annexTokens = featSeg.getRight().getTokenization(); + annexResults = label(annexFeatures); +// System.out.println(annexResults); } // post-process reference and footnote callout to keep them consistent (e.g. for example avoid that a footnote // callout in superscript is by error labeled as a numerical reference callout) List markerTypes = null; - if (resultBody != null) - markerTypes = postProcessCallout(resultBody, layoutTokenization); + if (bodyResults != null) + markerTypes = postProcessCallout(bodyResults, bodyLayoutTokens); // final combination toTEI(doc, // document - resultBody, resultAnnex, // labeled data for body and annex - layoutTokenization, tokenizationsBody2, // tokenization for body and annex + bodyResults, + annexResults, // labeled data for body and annex + bodyLayoutTokens, + annexTokens, // tokenization for body and annex resHeader, // header - figures, tables, equations, markerTypes, + figures, + tables, + equations, + markerTypes, config); return doc; } catch (GrobidException e) { @@ -334,6 +376,129 @@ else if (config.getConsolidateCitations() == 2) } } + static String revertResultsForBadItems(List badFiguresOrTables, String resultBody, String itemLabel) { + return revertResultsForBadItems(badFiguresOrTables, resultBody, itemLabel, true); + } + + static String revertResultsForBadItems(List badFiguresOrTables, String resultBody, String itemLabel, boolean strict) { + //LF: we update the resultBody sequence by reverting these tables as elements + if (CollectionUtils.isNotEmpty(badFiguresOrTables)) { + List> labelledResultsAsList = Arrays.stream(resultBody.split("\n")) + .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) + .collect(Collectors.toList()); + + for (Figure badItem : badFiguresOrTables) { + // Find the index of the first layoutToken of the table in the tokenization + List layoutTokenItem = badItem.getLayoutTokens(); + List candidateIndexes = findCandidateIndex(layoutTokenItem, labelledResultsAsList, + itemLabel, strict); + if (candidateIndexes.isEmpty()) { + LOGGER.info("Cannot find the candidate index for fixing the tables."); + continue; + } + + // At this point i have more than one candidate, which can be matched if the same first + // token is repeated in the sequence. The next step is to find the matching figure/table + // using a large sequence + + List sequenceTokenItemWithoutSpaces = layoutTokenItem.stream() + .map(LayoutToken::getText) + .map(StringUtils::strip) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toList()); + + //TODO: reduce candidate indexes after matching one sequence + int resultIndexCandidate = consolidateResultCandidateThroughSequence(candidateIndexes, labelledResultsAsList, sequenceTokenItemWithoutSpaces); + + if (resultIndexCandidate > -1) { + boolean first = true; + for (int i = resultIndexCandidate;i < Math.min(resultIndexCandidate + sequenceTokenItemWithoutSpaces.size(), labelledResultsAsList.size()); i++) { + List line = labelledResultsAsList.get(i); + String label = Iterables.getLast(line); + if (first) { + first = false; + } else { + if (label.startsWith("I-")) { + break; + } + } + line.set(line.size() - 1, label.replace(TaggingLabels.TABLE_LABEL, TaggingLabels.PARAGRAPH_LABEL)); + } + } else { + LOGGER.warn("Cannot find the result index candidate."); + } + } + + String updatedResultBody = labelledResultsAsList.stream() + .map(l -> String.join("\t", l)) + .collect(Collectors.joining("\n")); + + resultBody = updatedResultBody; + } + return resultBody; + } + + static int consolidateResultCandidateThroughSequence(List candidateIndexes, List> splitResult, List tokensNoSpaceItem) { + int resultIndexCandidate = -1; + if (candidateIndexes.size() == 1){ + resultIndexCandidate = candidateIndexes.get(0); + } else { + for (int candidateIndex: candidateIndexes) { + List candidateTable = splitResult.subList(candidateIndex, Math.min(candidateIndex + tokensNoSpaceItem.size(), splitResult.size())) + .stream() + .map(i -> i.get(0)) + .collect(Collectors.toList()); + + String candidateTableText = String.join("", candidateTable); + String tokensText = String.join("", tokensNoSpaceItem); + + if (candidateTableText.equals(tokensText)) { + resultIndexCandidate = candidateIndex; + break; + } + } + } + return resultIndexCandidate; + } + + /** + * Find a set of candidates representing the indexes from the labelledResults which could correspond + * to the first token of the figure/table + * + * strict = True then it will check the items related to I-
or I-
first + * and then the
or
only if there are not candidates + * strict = False is usually necessary if there are more tables than I- token, this because a figure/table could be + * identified within the sequence initially provided by the fulltext model + * + */ + @NotNull + static List findCandidateIndex(List layoutTokenItem, List> labelledResultsAsList, String itemLabel) { + return findCandidateIndex(layoutTokenItem, labelledResultsAsList, itemLabel, true); + } + + @NotNull + static List findCandidateIndex(List layoutTokenItem, List> labelledResultsAsList, String itemLabel, boolean strict) { + LayoutToken firstLayoutTokenItem = layoutTokenItem.get(0); + + List candidateIndexes = IntStream.range(0, labelledResultsAsList.size()) + .filter(i -> labelledResultsAsList.get(i).get(0).equals(firstLayoutTokenItem.getText()) + && Iterables.getLast(labelledResultsAsList.get(i)).equals("I-" + itemLabel)) + .boxed() + .collect(Collectors.toList()); + + if (candidateIndexes.isEmpty() || !strict) { + candidateIndexes = IntStream.range(0, labelledResultsAsList.size()) + .filter(i -> labelledResultsAsList.get(i).get(0).equals(firstLayoutTokenItem.getText()) + && ( + Iterables.getLast(labelledResultsAsList.get(i)).equals(itemLabel) + || Iterables.getLast(labelledResultsAsList.get(i)).equals("I-" + itemLabel)) + ) + .boxed() + .collect(Collectors.toList()); + } + return candidateIndexes; + } + /** * Machine-learning recognition of full text structures limted to header and funding information. @@ -518,44 +683,6 @@ public Pair> processShort(List tokens, Do return Pair.of(res, layoutTokenization); } - /** - * Post-process text labeled by the fulltext model on chunks that are known to be text (no table, or figure) - * It converts table and figure labels to paragraph labels. - */ - protected static String postProcessFullTextLabeledText(String fulltextLabeledText) { - if (fulltextLabeledText == null) - return null; - StringBuilder result = new StringBuilder(); - - String[] lines = fulltextLabeledText.split("\n"); - String previousLabel = null; - for(int i=0; i getBodyTextFeatured(Document doc, SortedSet documentBodyParts) { if ((documentBodyParts == null) || (documentBodyParts.size() == 0)) { @@ -1978,17 +2105,17 @@ private static boolean testClosingTag(StringBuilder buffer, buffer.append("
\n\n"); } else if (lastTag0.equals("")) { buffer.append("\n\n"); - } else if (lastTag0.equals("") || - lastTag0.equals("") || - lastTag0.equals("") || + } else if (lastTag0.equals("") || + lastTag0.equals("") || + lastTag0.equals("") || lastTag0.equals("")) { buffer.append(""); // Make sure that paragraph is closed when markers are at the end of it - if (!currentTag0.equals("") && - (!currentTag0.equals("") || - !currentTag0.equals("") || - !currentTag0.equals("") || + if (!currentTag0.equals("") && + (!currentTag0.equals("") || + !currentTag0.equals("") || + !currentTag0.equals("") || !currentTag0.equals("") ) ) { @@ -2152,10 +2279,12 @@ protected Pair processTrainingDataFigures(String rese, // If there still an open figure if (openFigure) { - while((tokenizationsFigure.size() > 0) && + while(CollectionUtils.isNotEmpty(tokenizationsFigure) && (tokenizationsFigure.get(0).getText().equals("\n") || - tokenizationsFigure.get(0).getText().equals(" ")) ) + tokenizationsFigure.get(0).getText().equals(" ")) + ) { tokenizationsFigure.remove(0); + } // process the "accumulated" figure Pair trainingData = parsers.getFigureParser() @@ -2195,9 +2324,9 @@ protected List
processTables(String rese, for (Table result : localResults) { List localTokenizationTable = result.getLayoutTokens(); - //result.setLayoutTokens(tokenizationTable); +// result.setRawLayoutTokens(tokenizationTable); - // block setting: we restrict to the tokenization of this particulart table + // block setting: we restrict to the tokenization of this particular table SortedSet blockPtrs = new TreeSet<>(); for (LayoutToken lt : localTokenizationTable) { if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) { @@ -2421,7 +2550,7 @@ protected List processEquations(String rese, } /** - * Ensure consistent use of callouts in the entire document body + * Ensure consistent use of callouts in the entire document body */ private List postProcessCallout(String result, LayoutTokenization layoutTokenization) { if (layoutTokenization == null) @@ -2481,7 +2610,7 @@ private List postProcessCallout(String result, LayoutTokenization la if (figureMarkerSeen.contains(refText)) { // already seen reference marker sequence, we skip it continue; - } + } MarkerType localMarkerType = CalloutAnalyzer.getCalloutType(refTokens); if (figureMarkerTypeCounts.get(localMarkerType) == null) figureMarkerTypeCounts.put(localMarkerType, 1); @@ -2494,7 +2623,7 @@ private List postProcessCallout(String result, LayoutTokenization la if (tableMarkerSeen.contains(refText)) { // already seen reference marker sequence, we skip it continue; - } + } MarkerType localMarkerType = CalloutAnalyzer.getCalloutType(refTokens); if (tableMarkerTypeCounts.get(localMarkerType) == null) tableMarkerTypeCounts.put(localMarkerType, 1); @@ -2507,16 +2636,16 @@ private List postProcessCallout(String result, LayoutTokenization la if (equationMarkerSeen.contains(refText)) { // already seen reference marker sequence, we skip it continue; - } + } MarkerType localMarkerType = CalloutAnalyzer.getCalloutType(refTokens); if (equationMarkerTypeCounts.get(localMarkerType) == null) equationMarkerTypeCounts.put(localMarkerType, 1); else - equationMarkerTypeCounts.put(localMarkerType, equationMarkerTypeCounts.get(localMarkerType)+1); + equationMarkerTypeCounts.put(localMarkerType, equationMarkerTypeCounts.get(localMarkerType)+1); if (!equationMarkerSeen.contains(refText)) - equationMarkerSeen.add(refText); - } + equationMarkerSeen.add(refText); + } } } @@ -2550,8 +2679,8 @@ private static MarkerType getBestType(Map markerTypeCount) { * and body sections. */ private void toTEI(Document doc, - String reseBody, - String reseAnnex, + String bodyLabellingResult, + String annexLabellingResult, LayoutTokenization layoutTokenization, List tokenizationsAnnex, BiblioItem resHeader, @@ -2577,13 +2706,13 @@ private void toTEI(Document doc, teiFormatter, resCitations, config); if (acknowledgmentStmt.length() > 0) { - MutablePair,List,List>> localResult = + MutablePair,List,List>> localResult = parsers.getFundingAcknowledgementParser().processingXmlFragment(acknowledgmentStmt.toString(), config); if (localResult != null && localResult.getLeft() != null) { - String local_tei = localResult.getLeft().toXML(); - local_tei = local_tei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); - annexStatements.add(local_tei); + String localTei = localResult.getLeft().toXML(); + localTei = localTei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); + annexStatements.add(localTei); } else { annexStatements.add(acknowledgmentStmt.toString()); @@ -2592,14 +2721,14 @@ private void toTEI(Document doc, if (localResult != null && localResult.getRight() != null) { if (localResult.getRight().getLeft() != null) { List localFundings = localResult.getRight().getLeft(); - if (localFundings.size()>0) { + if (CollectionUtils.isNotEmpty(localFundings)) { fundings.addAll(localFundings); } } if (localResult.getRight().getRight() != null) { List localAffiliations = localResult.getRight().getRight(); - if (localAffiliations.size()>0) { + if (CollectionUtils.isNotEmpty(localAffiliations)) { affiliations.addAll(localAffiliations); } } @@ -2620,14 +2749,14 @@ private void toTEI(Document doc, resCitations, config); } - if (fundingStmt.length() > 0) { - MutablePair,List,List>> localResult = + if (StringUtils.isNotBlank(fundingStmt)) { + MutablePair,List,List>> localResult = parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config); if (localResult != null && localResult.getLeft() != null) { - String local_tei = localResult.getLeft().toXML(); - local_tei = local_tei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); - annexStatements.add(local_tei); + String localTEI = localResult.getLeft().toXML(); + localTEI = localTEI.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); + annexStatements.add(localTEI); } else { annexStatements.add(fundingStmt.toString()); } @@ -2635,14 +2764,14 @@ private void toTEI(Document doc, if (localResult != null && localResult.getRight() != null) { if (localResult.getRight().getLeft() != null) { List localFundings = localResult.getRight().getLeft(); - if (localFundings.size()>0) { + if (CollectionUtils.isNotEmpty(localFundings)) { fundings.addAll(localFundings); } } if (localResult.getRight().getRight() != null) { List localAffiliations = localResult.getRight().getRight(); - if (localAffiliations.size()>0) { + if (CollectionUtils.isNotEmpty(localAffiliations)) { affiliations.addAll(localAffiliations); } } @@ -2659,13 +2788,13 @@ private void toTEI(Document doc, resCitations, config); if (fundingStmt.length() > 0) { - MutablePair,List,List>> localResult = + MutablePair,List,List>> localResult = parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config); if (localResult != null && localResult.getLeft() != null){ - String local_tei = localResult.getLeft().toXML(); - local_tei = local_tei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); - annexStatements.add(local_tei); + String localTEI = localResult.getLeft().toXML(); + localTEI = localTEI.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); + annexStatements.add(localTEI); } else { annexStatements.add(fundingStmt.toString()); } @@ -2673,14 +2802,14 @@ private void toTEI(Document doc, if (localResult != null && localResult.getRight() != null) { if (localResult.getRight().getLeft() != null) { List localFundings = localResult.getRight().getLeft(); - if (localFundings.size()>0) { + if (CollectionUtils.isNotEmpty(localFundings)) { fundings.addAll(localFundings); } } if (localResult.getRight().getRight() != null) { List localAffiliations = localResult.getRight().getRight(); - if (localAffiliations.size()>0) { + if (CollectionUtils.isNotEmpty(localAffiliations)) { affiliations.addAll(localAffiliations); } } @@ -2689,7 +2818,7 @@ private void toTEI(Document doc, tei.append(teiFormatter.toTEIHeader(resHeader, null, resCitations, markerTypes, fundings, config)); - tei = teiFormatter.toTEIBody(tei, reseBody, resHeader, resCitations, + tei = teiFormatter.toTEIBody(tei, bodyLabellingResult, resHeader, resCitations, layoutTokenization, figures, tables, equations, markerTypes, doc, config); tei.append("\t\t\n"); @@ -2709,15 +2838,15 @@ private void toTEI(Document doc, } if (affiliations != null && affiliations.size() >0) { - + // check if we have at least one acknowledged research infrastructure here List filteredInfrastructures = new ArrayList<>(); for(Affiliation affiliation : affiliations) { - if (affiliation.getAffiliationString() != null && affiliation.getAffiliationString().length()>0 && affiliation.isInfrastructure()) + if (affiliation.getAffiliationString() != null && affiliation.getAffiliationString().length()>0 && affiliation.isInfrastructure()) filteredInfrastructures.add(affiliation); else if (affiliation.getAffiliationString() != null && affiliation.getAffiliationString().length()>0) { // check if this organization is a known infrastructure - List localOrganizationNamings = + List localOrganizationNamings = Lexicon.getInstance().getOrganizationNamingInfo(affiliation.getAffiliationString()); if (localOrganizationNamings != null && localOrganizationNamings.size()>0) { filteredInfrastructures.add(affiliation); @@ -2729,7 +2858,7 @@ else if (affiliation.getAffiliationString() != null && affiliation.getAffiliatio if (filteredInfrastructures.size() > 0) { tei.append("\n\t\t\t\n"); for(Affiliation affiliation : filteredInfrastructures) { - List localOrganizationNamings = + List localOrganizationNamings = Lexicon.getInstance().getOrganizationNamingInfo(affiliation.getAffiliationString()); tei.append("\t\t\t\t"); tei.append("\t\t\t\t\t"); @@ -2749,7 +2878,7 @@ else if (affiliation.getAffiliationString() != null && affiliation.getAffiliatio } tei.append("\t\t\t\t\n"); } - + tei.append("\t\t\t\n"); } } @@ -2761,10 +2890,10 @@ else if (affiliation.getAffiliationString() != null && affiliation.getAffiliatio Pair> headerAvailabilityProcessed = processShort(headerAvailabilityStatementTokens, doc); if (headerAvailabilityProcessed != null) { availabilityStmt = teiFormatter.processTEIDivSection("availability", - "\t\t\t", - headerAvailabilityProcessed.getLeft(), - headerAvailabilityProcessed.getRight(), - resCitations, + "\t\t\t", + headerAvailabilityProcessed.getLeft(), + headerAvailabilityProcessed.getRight(), + resCitations, config); } if (availabilityStmt.length() > 0) { @@ -2774,17 +2903,17 @@ else if (affiliation.getAffiliationString() != null && affiliation.getAffiliatio // availability statements in non-header part availabilityStmt = getSectionAsTEI("availability", - "\t\t\t", - doc, - SegmentationLabels.AVAILABILITY, - teiFormatter, - resCitations, + "\t\t\t", + doc, + SegmentationLabels.AVAILABILITY, + teiFormatter, + resCitations, config); if (availabilityStmt.length() > 0) { tei.append(availabilityStmt.toString()); } - tei = teiFormatter.toTEIAnnex(tei, reseAnnex, resHeader, resCitations, + tei = teiFormatter.toTEIAnnex(tei, annexLabellingResult, resHeader, resCitations, tokenizationsAnnex, markerTypes, doc, config); tei = teiFormatter.toTEIReferences(tei, resCitations, config); @@ -2829,7 +2958,7 @@ private void toTEIHeaderFunding(Document doc, teiFormatter, null, config); if (acknowledgmentStmt.length() > 0) { - MutablePair,List,List>> localResult = + MutablePair,List,List>> localResult = parsers.getFundingAcknowledgementParser().processingXmlFragment(acknowledgmentStmt.toString(), config); if (localResult != null && localResult.getLeft() != null) { @@ -2873,7 +3002,7 @@ private void toTEIHeaderFunding(Document doc, config); } if (fundingStmt.length() > 0) { - MutablePair,List,List>> localResult = + MutablePair,List,List>> localResult = parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config); if (localResult != null && localResult.getLeft() != null) { @@ -2911,7 +3040,7 @@ private void toTEIHeaderFunding(Document doc, null, config); if (fundingStmt.length() > 0) { - MutablePair,List,List>> localResult = + MutablePair,List,List>> localResult = parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config); if (localResult != null && localResult.getLeft() != null){ @@ -2957,15 +3086,15 @@ private void toTEIHeaderFunding(Document doc, } if (affiliations != null && affiliations.size() >0) { - + // check if we have at least one acknowledged research infrastructure here List filteredInfrastructures = new ArrayList<>(); for(Affiliation affiliation : affiliations) { - if (affiliation.getAffiliationString() != null && affiliation.getAffiliationString().length()>0 && affiliation.isInfrastructure()) + if (affiliation.getAffiliationString() != null && affiliation.getAffiliationString().length()>0 && affiliation.isInfrastructure()) filteredInfrastructures.add(affiliation); else if (affiliation.getAffiliationString() != null && affiliation.getAffiliationString().length()>0) { // check if this organization is a known infrastructure - List localOrganizationNamings = + List localOrganizationNamings = Lexicon.getInstance().getOrganizationNamingInfo(affiliation.getAffiliationString()); if (localOrganizationNamings != null && localOrganizationNamings.size()>0) { filteredInfrastructures.add(affiliation); @@ -2977,7 +3106,7 @@ else if (affiliation.getAffiliationString() != null && affiliation.getAffiliatio if (filteredInfrastructures.size() > 0) { tei.append("\n\t\t\t\n"); for(Affiliation affiliation : filteredInfrastructures) { - List localOrganizationNamings = + List localOrganizationNamings = Lexicon.getInstance().getOrganizationNamingInfo(affiliation.getAffiliationString()); tei.append("\t\t\t\t"); tei.append("\t\t\t\t\t"); @@ -2997,7 +3126,7 @@ else if (affiliation.getAffiliationString() != null && affiliation.getAffiliatio } tei.append("\t\t\t\t\n"); } - + tei.append("\t\t\t\n"); } } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java index 185f3714d5..2c72ac1600 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -1,5 +1,6 @@ package org.grobid.core.engines; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; @@ -108,9 +109,15 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc, String res = null; if (StringUtils.isNotBlank(header)) { res = label(header); + + if (GrobidProperties.getGrobidEngineName("header").equals("delft")) { + res = LabelUtils.postProcessFulltextCorrectSequencesWithoutInitialToken(res); + } resHeader = resultExtraction(res, headerTokenization, resHeader); } + + // language identification StringBuilder contentSample = new StringBuilder(); if (resHeader.getTitle() != null) { @@ -261,7 +268,7 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc, //resHeader.setKeyword(keywords.replace("\n", " ").replace(" ", " ")); resHeader.setKeyword(keywords); List keywordsSegmented = BiblioItem.segmentKeywords(keywords); - if ((keywordsSegmented != null) && (keywordsSegmented.size() > 0)) + if (CollectionUtils.isNotEmpty(keywordsSegmented)) resHeader.setKeywords(keywordsSegmented); } @@ -311,7 +318,7 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc, } // copyrights/license identification - if (resHeader.getCopyright() != null && resHeader.getCopyright().length()>0) { + if (StringUtils.isNotBlank(resHeader.getCopyright())) { if (GrobidProperties.getGrobidEngineName("copyright").equals("delft")) { CopyrightsLicense copyrightsLicense = LicenseClassifier.getInstance().classify(resHeader.getCopyright()); if (copyrightsLicense != null) @@ -928,6 +935,7 @@ else if (biblio.getPublicationDate() == null) // this will need to be reviewed with more training data, for the moment // avoid concatenation for abstracts as it brings more noise than correct pieces //biblio.setAbstract(biblio.getAbstract() + " " + clusterContent); + //TODO: avoid dumping text on the floor } else { biblio.setAbstract(clusterContent); List tokens = cluster.concatTokens(); diff --git a/grobid-core/src/main/kotlin/org/grobid/core/utilities/LabelUtils.kt b/grobid-core/src/main/kotlin/org/grobid/core/utilities/LabelUtils.kt new file mode 100644 index 0000000000..eecc6d88d9 --- /dev/null +++ b/grobid-core/src/main/kotlin/org/grobid/core/utilities/LabelUtils.kt @@ -0,0 +1,132 @@ +package org.grobid.core.utilities + +import org.apache.commons.lang3.StringUtils +import org.grobid.core.engines.label.TaggingLabels + +object LabelUtils { + /** + * Post-process text labeled by the fulltext model on chunks that are known to be text (no table, or figure) + * It converts table and figure labels to paragraph labels. + */ + @JvmStatic + fun postProcessFullTextLabeledText(fulltextLabeledText: String): String { + val result = StringBuilder() + + val lines = fulltextLabeledText + .split("\n".toRegex()) + .dropLastWhile { it.isEmpty() } + .toTypedArray() + var previousLabel: String? = null + + for (i in lines.indices) { + val line = lines[i] + if (StringUtils.isBlank(line)) continue + + val pieces = line.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray() + val label = pieces[pieces.size - 1] + if (label == "I-" + TaggingLabels.FIGURE.label || label == "I-" + TaggingLabels.TABLE.label) { + if (previousLabel == null || !previousLabel.endsWith(TaggingLabels.PARAGRAPH.label)) { + pieces[pieces.size - 1] = "I-" + TaggingLabels.PARAGRAPH.label + } else { + pieces[pieces.size - 1] = TaggingLabels.PARAGRAPH.label + } + } else if (label == TaggingLabels.FIGURE.label || label == TaggingLabels.TABLE.label) { + pieces[pieces.size - 1] = TaggingLabels.PARAGRAPH.label + } + result.append(pieces.joinToString("\t")) + previousLabel = label + result.append("\n") + } + + return result.toString() + } + + /** + * This method correct the fulltext sequence when the model has predicted several unlikely + * start sequences of table or figures. + * For example: I-
followed by another I-
(or table)
+ **/ + @JvmStatic + fun postProcessFulltextFixInvalidTableOrFigure(fulltextLabeledText: String): String { + val result = StringBuilder() + + val lines = fulltextLabeledText + .split("\n".toRegex()) + .dropLastWhile { it.isEmpty() } + .toTypedArray() + + var previousLabel: String? = null + for (i in lines.indices) { + val line = lines[i] + if (StringUtils.isBlank(line)) continue + + val pieces = line + .split("\t".toRegex()) + .dropLastWhile { it.isEmpty() } + .toTypedArray() + + val label = pieces[pieces.size - 1] + if (label == "I-" + TaggingLabels.FIGURE.label) { + if (StringUtils.equals(previousLabel, "I-" + TaggingLabels.FIGURE.label)) { + pieces[pieces.size - 1] = TaggingLabels.FIGURE.label + } + } else if (label == "I-" + TaggingLabels.TABLE.label) { + if (StringUtils.equals(previousLabel, "I-" + TaggingLabels.TABLE.label)) { + pieces[pieces.size - 1] = TaggingLabels.TABLE.label + } + } + + result.append(pieces.joinToString("\t")) + previousLabel = label + result.append("\n") + } + + return result.toString() + } + + /** + * PostProcess the sequence of labels by reverting change of labels that are not starting with a initial sequence. + * For example, in a sequence of I- followed by , + * we revert all sequence of availability as + */ + @JvmStatic + fun postProcessFulltextCorrectSequencesWithoutInitialToken(fulltextLabeledText: String): String { + val result = StringBuilder() + + val lines = fulltextLabeledText + .split("\n".toRegex()) + .dropLastWhile { it.isEmpty() } + .toTypedArray() + + var previousLabel: String? = null + for (i in lines.indices) { + val line = lines[i] + if (StringUtils.isBlank(line)) continue + val pieces = line + .split("\t".toRegex()) + .dropLastWhile { it.isEmpty() } + .toTypedArray() + var label = pieces[pieces.size - 1] + + if (!label.equals(TaggingLabels.OTHER_LABEL)) { + if (!label.startsWith("I-")) { + if (previousLabel != null && previousLabel != label) { + pieces[pieces.size - 1] = previousLabel + label = previousLabel + } + } + } + + result.append(pieces.joinToString("\t")) + if (label == TaggingLabels.OTHER_LABEL) { + previousLabel = null + } else { + previousLabel = label.replace("I-", "") + } + result.append("\n") + } + + return result.toString() + } + +} diff --git a/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java b/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java deleted file mode 100644 index a4fb60a4e9..0000000000 --- a/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java +++ /dev/null @@ -1,263 +0,0 @@ -package org.grobid.core.engines; - -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.tuple.Pair; -import org.grobid.core.analyzers.GrobidAnalyzer; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.layout.LayoutToken; -import org.grobid.core.main.LibraryLoader; -import org.grobid.core.utilities.GrobidProperties; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; - -import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.collection.IsCollectionWithSize.hasSize; - -public class FullTextParserTest { - - private FullTextParser target; - - @Before - public void setUp() throws Exception { - target = new FullTextParser(new EngineParsers()); - } - - @BeforeClass - public static void init() { - LibraryLoader.load(); - GrobidProperties.getInstance(); - } - - @AfterClass - public static void tearDown() { - GrobidFactory.reset(); - } - - @Test - public void testProcessTrainingDataFigures_single_figure() throws Exception { - String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; - List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + - "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; - - - Pair stringStringPair = target.processTrainingDataFigures(rese, tokens, "123"); - - String tei = stringStringPair.getLeft(); - String tokenisation = stringStringPair.getRight(); - String reconstructedText = Arrays.stream(tokenisation.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); - - assertThat(reconstructedText, is("FIG . 1 . λ ( T ) vs . T for YBCO")); - assertThat(tokenisation.split("\n").length, is(13)); - - } - - @Test - public void testProcessTrainingDataFigures_multiple_figures() throws Exception { - String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; - List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + - "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; - - - Pair stringStringPair = target.processTrainingDataFigures(rese, tokens, "123"); - - String tei = stringStringPair.getLeft(); - String tokenisation = stringStringPair.getRight(); - List output = new ArrayList<>(); - for (String block : tokenisation.split("\n\n\n")) { - String collect = Arrays.stream(block.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); - if (StringUtils.isNotBlank(collect)) { - output.add(collect); - } - } - - assertThat(output, hasSize(2)); - assertThat(output.get(0), is("FIG . 1 . λ ( T )")); - assertThat(output.get(1), is("vs . T for YBCO")); - assertThat(tokenisation.split("\n").length, is(15)); - - } - - @Test - public void testProcessTrainingDataTables_single_table() throws Exception { - String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; - List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + - "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; - - - Pair stringStringPair = target.processTrainingDataTables(rese, tokens, "123"); - - String tei = stringStringPair.getLeft(); - String tokenisation = stringStringPair.getRight(); - String reconstructedText = Arrays.stream(tokenisation.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); - - assertThat(reconstructedText, is("FIG . 1 . λ ( T ) vs . T for YBCO")); - assertThat(tokenisation.split("\n").length, is(13)); - - } - - @Test - public void testProcessTrainingDataTable_multiple_tables() throws Exception { - String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; - List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + - "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; - - - Pair stringStringPair = target.processTrainingDataTables(rese, tokens, "123"); - - String tei = stringStringPair.getLeft(); - String tokenisation = stringStringPair.getRight(); - List output = new ArrayList<>(); - for (String block : tokenisation.split("\n\n\n")) { - String collect = Arrays.stream(block.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); - if (StringUtils.isNotBlank(collect)) { - output.add(collect); - } - } - - assertThat(output, hasSize(2)); - assertThat(output.get(0), is("FIG . 1 . λ ( T )")); - assertThat(output.get(1), is("vs . T for YBCO")); - assertThat(tokenisation.split("\n").length, is(15)); - - } - - @Test - public void testPostProcessLabeledAbstract_shouldTransformTableLabelInParagraphLabel() { - String resultWithTables = "This\tthis\tT\tTh\tThi\tThis\ts\tis\this\tThis\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\tI-
\n" + - "study\tstudy\ts\tst\tstu\tstud\ty\tdy\tudy\ttudy\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + - "was\twas\tw\twa\twas\twas\ts\tas\twas\twas\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + - "supported\tsupported\ts\tsu\tsup\tsupp\td\ted\tted\trted\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + - "by\tby\tb\tby\tby\tby\ty\tby\tby\tby\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + - "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "South\tsouth\tS\tSo\tSou\tSout\th\tth\tuth\touth\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "Asian\tasian\tA\tAs\tAsi\tAsia\tn\tan\tian\tsian\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "Clinical\tclinical\tC\tCl\tCli\tClin\tl\tal\tcal\tical\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "Toxicology\ttoxicology\tT\tTo\tTox\tToxi\ty\tgy\togy\tlogy\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t2\t10\t0\tNUMBER\t0\t0\t
\n" + - "Collaboration\tcollaboration\tC\tCo\tCol\tColl\tn\ton\tion\ttion\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t2\t10\t0\tNUMBER\t0\t0\t
\n" + - ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tCOMMA\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "which\twhich\tw\twh\twhi\twhic\th\tch\tich\thich\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "is\tis\ti\tis\tis\tis\ts\tis\tis\tis\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "funded\tfunded\tf\tfu\tfun\tfund\td\ted\tded\tnded\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "by\tby\tb\tby\tby\tby\ty\tby\tby\tby\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "Wellcome\twellcome\tW\tWe\tWel\tWell\te\tme\tome\tcome\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + - "Trust\ttrust\tT\tTr\tTru\tTrus\tt\tst\tust\trust\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + - "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + - "National\tnational\tN\tNa\tNat\tNati\tl\tal\tnal\tonal\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + - "Health\thealth\tH\tHe\tHea\tHeal\th\tth\tlth\talth\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + - "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + - "Medical\tmedical\tM\tMe\tMed\tMedi\tl\tal\tcal\tical\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + - "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + - "Council\tcouncil\tC\tCo\tCou\tCoun\tl\til\tcil\tncil\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + - "International\tinternational\tI\tIn\tInt\tInte\tl\tal\tnal\tonal\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + - "Collaborative\tcollaborative\tC\tCo\tCol\tColl\te\tve\tive\ttive\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + - "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t7\t10\t0\tNUMBER\t0\t0\t
\n" + - "Grant\tgrant\tG\tGr\tGra\tGran\tt\tnt\tant\trant\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t7\t10\t0\tNUMBER\t0\t0\t
\n" + - "GR071669MA\tgr071669ma\tG\tGR\tGR0\tGR07\tA\tMA\t9MA\t69MA\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tCONTAINSDIGITS\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - "funding\tfunding\tf\tfu\tfun\tfund\tg\tng\ting\tding\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - "bodies\tbodies\tb\tbo\tbod\tbodi\ts\tes\ties\tdies\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - "had\thad\th\tha\thad\thad\td\tad\thad\thad\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "no\tno\tn\tno\tno\tno\to\tno\tno\tno\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "role\trole\tr\tro\trol\trole\te\tle\tole\trole\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "in\tin\ti\tin\tin\tin\tn\tin\tin\tin\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "analyzing\tanalyzing\ta\tan\tana\tanal\tg\tng\ting\tzing\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "or\tor\to\tor\tor\tor\tr\tor\tor\tor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + - "interpreting\tinterpreting\ti\tin\tint\tinte\tg\tng\ting\tting\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + - "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + - "data\tdata\td\tda\tdat\tdata\ta\tta\tata\tdata\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + - "or\tor\to\tor\tor\tor\tr\tor\tor\tor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + - "writing\twriting\tw\twr\twri\twrit\tg\tng\ting\tting\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + - "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + - "article\tarticle\ta\tar\tart\tarti\te\tle\tcle\ticle\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t11\t10\t0\tNUMBER\t0\t0\t
"; - String postprocessed = FullTextParser.postProcessFullTextLabeledText(resultWithTables); - - assertThat(Arrays.stream(StringUtils.split(postprocessed, "\n")) - .filter(l -> l.endsWith("
")) - .count(), is(0L)); - - assertThat(Arrays.stream(StringUtils.split(postprocessed, "\n")) - .filter(l -> l.endsWith("")) - .count(), is (Arrays.stream(StringUtils.split(resultWithTables, "\n")) - .filter(l -> l.endsWith("
")) - .count())); - - } - - -} \ No newline at end of file diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/GrobidTestUtils.java b/grobid-core/src/test/java/org/grobid/core/utilities/GrobidTestUtils.java index f25b263049..9b7db0c685 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/GrobidTestUtils.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/GrobidTestUtils.java @@ -10,6 +10,10 @@ public class GrobidTestUtils { + public static String getWapitiResult(List features, List> labels) { + return getWapitiResult(features, labels, " "); + } + /** * Utility method to generate a hypotetical result from wapiti. * Useful for testing the extraction of the sequence labeling. @@ -17,7 +21,7 @@ public class GrobidTestUtils { * @param labels label maps. A list of Triples, containing label (left), start_index (middle) and end_index exclusive (right) * @return a string containing the resulting features + labels returned by wapiti */ - public static String getWapitiResult(List features, List> labels) { + public static String getWapitiResult(List features, List> labels, String separator) { List labeled = new ArrayList<>(); int idx = 0; @@ -52,7 +56,7 @@ public static String getWapitiResult(List features, List\n" + + "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n" + + + val stringStringPair = target!!.processTrainingDataFigures(rese, tokens, "123") + + val tei = stringStringPair.left + val tokenisation = stringStringPair.right + val reconstructedText = + Arrays.stream(tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map { l: String -> l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()[0] } + .collect(Collectors.joining(" ")) + + MatcherAssert.assertThat(reconstructedText, CoreMatchers.`is`("FIG . 1 . λ ( T ) vs . T for YBCO")) + MatcherAssert.assertThat( + tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size, + CoreMatchers.`is`(13) + ) + } + + @Test + @Throws(Exception::class) + fun testProcessTrainingDataFigures_multiple_figures() { + val text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO" + val tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text) + val rese = + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + + "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n" + + + val stringStringPair = target!!.processTrainingDataFigures(rese, tokens, "123") + + val tei = stringStringPair.left + val tokenisation = stringStringPair.right + val output: MutableList = ArrayList() + for (block in tokenisation.split("\n\n\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) { + val collect = Arrays.stream(block.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map { l: String -> l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()[0] } + .collect(Collectors.joining(" ")) + if (StringUtils.isNotBlank(collect)) { + output.add(collect) + } + } + + MatcherAssert.assertThat>(output, IsCollectionWithSize.hasSize(2)) + MatcherAssert.assertThat(output[0], CoreMatchers.`is`("FIG . 1 . λ ( T )")) + MatcherAssert.assertThat(output[1], CoreMatchers.`is`("vs . T for YBCO")) + MatcherAssert.assertThat( + tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size, + CoreMatchers.`is`(15) + ) + } + + @Test + @Throws(Exception::class) + fun testProcessTrainingDataTables_single_table() { + val text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO" + val tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text) + val rese = + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + + "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n" + + + val stringStringPair = target!!.processTrainingDataTables(rese, tokens, "123") + + val tei = stringStringPair.left + val tokenisation = stringStringPair.right + val reconstructedText = + Arrays.stream(tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map { l: String -> l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()[0] } + .collect(Collectors.joining(" ")) + + MatcherAssert.assertThat(reconstructedText, CoreMatchers.`is`("FIG . 1 . λ ( T ) vs . T for YBCO")) + MatcherAssert.assertThat( + tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size, + CoreMatchers.`is`(13) + ) + } + + @Test + @Throws(Exception::class) + fun testProcessTrainingDataTable_multiple_tables() { + val text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO" + val tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text) + val rese = + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + + "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n" + + + val stringStringPair = target!!.processTrainingDataTables(rese, tokens, "123") + + val tei = stringStringPair.left + val tokenisation = stringStringPair.right + val output: MutableList = ArrayList() + for (block in tokenisation.split("\n\n\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) { + val collect = Arrays.stream(block.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map { l: String -> l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()[0] } + .collect(Collectors.joining(" ")) + if (StringUtils.isNotBlank(collect)) { + output.add(collect) + } + } + + MatcherAssert.assertThat>(output, IsCollectionWithSize.hasSize(2)) + MatcherAssert.assertThat(output[0], CoreMatchers.`is`("FIG . 1 . λ ( T )")) + MatcherAssert.assertThat(output[1], CoreMatchers.`is`("vs . T for YBCO")) + MatcherAssert.assertThat( + tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size, + CoreMatchers.`is`(15) + ) + } + + @Test + fun testFindCandidates_shouldFindMultipleResults() { + // I need to prepare a sequence where there might be multiple matches, + // and then verify that the sequence is correctly used for discrimination + var sequence = "This article solves the problem where some of our interaction are fauly. " + + "a 8 9 j 92j 3 3j 9 j 9j Table 1: The reconstruction of the national anthem " + + "We are interested in the relation between certain information and " + + "a b b d 1 2 3 4 s 3 3 d9 Table 2: The relation between information and noise " + + "the related affectionality. " + + "a b b d 1 2 3 4 5 6 7 Table 3: The relation between homicides and donuts eating " + + "The relation between homicides and donuts eating is a very important one. " + + var tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(sequence) + + // These triples made in following way: label, starting index (included), ending index (excluded) + val labels = listOf( + Triple.of("I-", 0, 1), + Triple.of("", 1, 24), + Triple.of("I-
", 25, 26), + Triple.of("
", 26, 61), + Triple.of("I-", 62, 63), + Triple.of("", 63, 81), + Triple.of("I-
", 82, 83), + Triple.of("
", 82, 118), + Triple.of("I-", 119, 120), + Triple.of("", 120, 129), + Triple.of("I-
", 130, 131), + Triple.of("
", 131, 171), + Triple.of("I-", 171, 172), + Triple.of("", 172, 195), + ) + + val features = tokens.stream().map { it.text }.collect(Collectors.toList()) + val wapitiResults = GrobidTestUtils.getWapitiResult(features, labels, "\t") + + val wapitiResultsAsList = + Arrays.stream(wapitiResults.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map> { l: String -> + Arrays.stream( + l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray() + ) + .collect(Collectors.toList()) + } + .collect(Collectors.toList()) + + val table1Tokens = tokens.subList(25, 61) + val foundCandidateIndex = FullTextParser.findCandidateIndex(table1Tokens, wapitiResultsAsList, TABLE_LABEL) + + assertThat(foundCandidateIndex, hasSize(3)) + assertThat(foundCandidateIndex.get(0), `is`(13)) + assertThat(foundCandidateIndex.get(1), `is`(42)) + assertThat(foundCandidateIndex.get(2), `is`(67)) + } + + @Test + fun testConsolidateResultCandidateThroughSequence() { + // var mockDocumentSource = createMock(DocumentSource::class.java) + // var document = Document.createFromText("") + val sequence = "This article solves the problem where some of our interaction are fauly. " + + "a 8 9 j 92j 3 3j 9 j 9j Table 1: The reconstruction of the national anthem " + + "We are interested in the relation between certain information and " + + "a b b d 1 2 3 4 s 3 3 d9 Table 2: The relation between information and noise " + + "the related affectionality. " + + "a b b d 1 2 3 4 5 6 7 Table 3: The relation between homicides and donuts eating " + + "The relation between homicides and donuts eating is a very important one. " + + val tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(sequence) + + // These triples made in following way: label, starting index (included), ending index (excluded) + val labels = listOf( + Triple.of("I-", 0, 1), + Triple.of("", 1, 24), + Triple.of("I-
", 25, 26), + Triple.of("
", 26, 61), + Triple.of("I-", 62, 63), + Triple.of("", 63, 81), + Triple.of("I-
", 82, 83), + Triple.of("
", 82, 118), + Triple.of("I-", 119, 120), + Triple.of("", 120, 129), + Triple.of("I-
", 130, 131), + Triple.of("
", 131, 171), + Triple.of("I-", 171, 172), + Triple.of("", 172, 195), + ) + + val features = tokens.stream().map { it.text }.collect(Collectors.toList()) + + val wapitiResults = GrobidTestUtils.getWapitiResult(features, labels, "\t") + val wapitiResultsAsList = + Arrays.stream(wapitiResults.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map> { l: String -> + Arrays.stream( + l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray() + ) + .collect(Collectors.toList()) + } + .collect(Collectors.toList()) + + val table1Tokens = tokens.subList(25, 61) + + val sequenceTokenWithoutSpacesTable1: List = table1Tokens.stream() + .map { obj: LayoutToken -> obj.text } + .map { str: String? -> StringUtils.strip(str) } + .filter { cs: String? -> StringUtils.isNotBlank(cs) } + .collect(Collectors.toList()) + + val candidatesIndexes = Arrays.asList( + 13, 42, 67 + ) + val consolidatedTable1ResultCandidateThroughSequence = FullTextParser.consolidateResultCandidateThroughSequence( + candidatesIndexes, + wapitiResultsAsList, + sequenceTokenWithoutSpacesTable1 + ) + + assertThat(consolidatedTable1ResultCandidateThroughSequence, `is`(13)) + + val table2Tokens = tokens.subList(82, 118) + + var sequenceTokenWithoutSpacesTable2: MutableList? = table2Tokens.stream() + .map { obj: LayoutToken -> obj.text } + .map { str: String? -> StringUtils.strip(str) } + .filter { cs: String? -> StringUtils.isNotBlank(cs) } + .collect(Collectors.toList()) + + val consolidatedTable2ResultCandidateThroughSequence = FullTextParser.consolidateResultCandidateThroughSequence( + candidatesIndexes, + wapitiResultsAsList, + sequenceTokenWithoutSpacesTable2 + ) + + assertThat(consolidatedTable2ResultCandidateThroughSequence, `is`(42)) + + val table3Tokens = tokens.subList(130, 171) + + var sequenceTokenWithoutSpacesTable3: MutableList? = table3Tokens.stream() + .map { obj: LayoutToken -> obj.text } + .map { str: String? -> StringUtils.strip(str) } + .filter { cs: String? -> StringUtils.isNotBlank(cs) } + .collect(Collectors.toList()) + + val consolidatedTable3ResultCandidateThroughSequence = FullTextParser.consolidateResultCandidateThroughSequence( + candidatesIndexes, + wapitiResultsAsList, + sequenceTokenWithoutSpacesTable3 + ) + + assertThat(consolidatedTable3ResultCandidateThroughSequence, `is`(67)) + } +} \ No newline at end of file diff --git a/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt new file mode 100644 index 0000000000..0cb3f6a731 --- /dev/null +++ b/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt @@ -0,0 +1,470 @@ +package org.grobid.core.utilities + +import org.apache.commons.lang3.StringUtils +import org.grobid.core.utilities.GrobidConfig.ModelParameters +import org.hamcrest.CoreMatchers.`is` +import org.hamcrest.CoreMatchers.not +import org.hamcrest.MatcherAssert.assertThat +import org.junit.jupiter.api.BeforeAll +import java.util.* +import java.util.stream.Collectors +import kotlin.test.Test + + +class LabelUtilsTest { + + + @Test + fun testPostProcessLabeledAbstract_shouldTransformTableLabelInParagraphLabel() { + val resultWithTables = + "This\tthis\tT\tTh\tThi\tThis\ts\tis\this\tThis\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\tI-
\n" + + "study\tstudy\ts\tst\tstu\tstud\ty\tdy\tudy\ttudy\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + + "was\twas\tw\twa\twas\twas\ts\tas\twas\twas\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + + "supported\tsupported\ts\tsu\tsup\tsupp\td\ted\tted\trted\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + + "by\tby\tb\tby\tby\tby\ty\tby\tby\tby\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + + "South\tsouth\tS\tSo\tSou\tSout\th\tth\tuth\touth\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + + "Asian\tasian\tA\tAs\tAsi\tAsia\tn\tan\tian\tsian\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + + "Clinical\tclinical\tC\tCl\tCli\tClin\tl\tal\tcal\tical\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + + "Toxicology\ttoxicology\tT\tTo\tTox\tToxi\ty\tgy\togy\tlogy\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + + "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t2\t10\t0\tNUMBER\t0\t0\t
\n" + + "Collaboration\tcollaboration\tC\tCo\tCol\tColl\tn\ton\tion\ttion\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t2\t10\t0\tNUMBER\t0\t0\t
\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tCOMMA\t3\t10\t0\tNUMBER\t0\t0\t
\n" + + "which\twhich\tw\twh\twhi\twhic\th\tch\tich\thich\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + + "is\tis\ti\tis\tis\tis\ts\tis\tis\tis\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + + "funded\tfunded\tf\tfu\tfun\tfund\td\ted\tded\tnded\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + + "by\tby\tb\tby\tby\tby\ty\tby\tby\tby\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + + "Wellcome\twellcome\tW\tWe\tWel\tWell\te\tme\tome\tcome\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + + "Trust\ttrust\tT\tTr\tTru\tTrus\tt\tst\tust\trust\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + + "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + + "National\tnational\tN\tNa\tNat\tNati\tl\tal\tnal\tonal\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + + "Health\thealth\tH\tHe\tHea\tHeal\th\tth\tlth\talth\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + + "Medical\tmedical\tM\tMe\tMed\tMedi\tl\tal\tcal\tical\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + + "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + + "Council\tcouncil\tC\tCo\tCou\tCoun\tl\til\tcil\tncil\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + + "International\tinternational\tI\tIn\tInt\tInte\tl\tal\tnal\tonal\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + + "Collaborative\tcollaborative\tC\tCo\tCol\tColl\te\tve\tive\ttive\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + + "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t7\t10\t0\tNUMBER\t0\t0\t
\n" + + "Grant\tgrant\tG\tGr\tGra\tGran\tt\tnt\tant\trant\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t7\t10\t0\tNUMBER\t0\t0\t
\n" + + "GR071669MA\tgr071669ma\tG\tGR\tGR0\tGR07\tA\tMA\t9MA\t69MA\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tCONTAINSDIGITS\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + + "funding\tfunding\tf\tfu\tfun\tfund\tg\tng\ting\tding\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + + "bodies\tbodies\tb\tbo\tbod\tbodi\ts\tes\ties\tdies\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + + "had\thad\th\tha\thad\thad\td\tad\thad\thad\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + + "no\tno\tn\tno\tno\tno\to\tno\tno\tno\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + + "role\trole\tr\tro\trol\trole\te\tle\tole\trole\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + + "in\tin\ti\tin\tin\tin\tn\tin\tin\tin\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + + "analyzing\tanalyzing\ta\tan\tana\tanal\tg\tng\ting\tzing\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + + "or\tor\to\tor\tor\tor\tr\tor\tor\tor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + + "interpreting\tinterpreting\ti\tin\tint\tinte\tg\tng\ting\tting\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + + "data\tdata\td\tda\tdat\tdata\ta\tta\tata\tdata\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + + "or\tor\to\tor\tor\tor\tr\tor\tor\tor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + + "writing\twriting\tw\twr\twri\twrit\tg\tng\ting\tting\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + + "article\tarticle\ta\tar\tart\tarti\te\tle\tcle\ticle\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t11\t10\t0\tNUMBER\t0\t0\t
" + + val postprocessed = LabelUtils.postProcessFullTextLabeledText(resultWithTables) + + assertThat( + Arrays.stream(StringUtils.split(postprocessed, "\n")) + .filter { l -> l.endsWith("
") } + .count(), `is`(0L) + ) + + assertThat( + Arrays.stream(StringUtils.split(postprocessed, "\n")) + .filter { l -> l.endsWith("") } + .count(), `is`( + Arrays.stream(StringUtils.split(resultWithTables, "\n")) + .filter { l -> l.endsWith("
") } + .count()) + ) + } + +// fun testAdjustInvalidSequenceOfStartLabels() { +// val inputStream = javaClass.getResourceAsStream("bodyResults-sample.1.txt") +// val bodyResult = inputStream?.bufferedReader().use { it.readText() } +// +// val postProcessed = LabelUtils.postProcessFullTextLabeledText(bodyResult) +// } + + @Test + fun testPostProcessFulltextFixInvalidTableOrFigure_noChangeNeeded_shouldReturnSameTableOrFigureSequence() { + val bodyResult = + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t11\t0\tNUMBER\t0\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKEND\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t11\t0\tNUMBER\t0\t0\t\n" + + "014306\t014306\t0\t01\t014\t0143\t6\t06\t306\t4306\tBLOCKSTART\tLINESTART\tLINEINDENT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t8\t11\t0\tNUMBER\t0\t0\tI-\n" + + "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tHYPHEN\t8\t11\t0\tNUMBER\t0\t0\t\n" + + "4\t4\t4\t4\t4\t4\t4\t4\t4\t4\tBLOCKEND\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t11\t0\tNUMBER\t1\t0\t\n" + + "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t0\tI-\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t3\t0\tNUMBER\t0\t0\tI-\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "average\taverage\ta\tav\tave\taver\te\tge\tage\trage\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "distances\tdistances\td\tdi\tdis\tdist\ts\tes\tces\tnces\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "between\tbetween\tb\tbe\tbet\tbetw\tn\ten\teen\tween\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "nucleons\tnucleons\tn\tnu\tnuc\tnucl\ts\tns\tons\teons\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "in\tin\ti\tin\tin\tin\tn\tin\tin\tin\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t1\t\n" + + "Be\tbe\tB\tBe\tBe\tBe\te\tBe\tBe\tBe\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "0\t0\t0\t0\t0\t0\t0\t0\t0\t0\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "+\t+\t+\t+\t+\t+\t+\t+\t+\t+\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tLOWERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t1\t\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t0\t\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tCOMMA\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t1\t\n" + + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "3\t3\t3\t3\t3\t3\t3\t3\t3\t3\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t0\t\n" + + "+\t+\t+\t+\t+\t+\t+\t+\t+\t+\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tLOWERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t1\t\n" + + "0\t0\t0\t0\t0\t0\t0\t0\t0\t0\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tCOMMA\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t1\tI-\n" + + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\tI-\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t0\t\n" + + "+\t+\t+\t+\t+\t+\t+\t+\t+\t+\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tLOWERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t1\t\n" + + "0\t0\t0\t0\t0\t0\t0\t0\t0\t0\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tCOMMA\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t1\t\n" + + "C\tc\tC\tC\tC\tC\tC\tC\tC\tC\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "0\t0\t0\t0\t0\t0\t0\t0\t0\t0\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "+\t+\t+\t+\t+\t+\t+\t+\t+\t+\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tLOWERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t1\t\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t0\t\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "states\tstates\ts\tst\tsta\tstat\ts\tes\ttes\tates\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "solid\tsolid\ts\tso\tsol\tsoli\td\tid\tlid\tolid\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "lines\tlines\tl\tli\tlin\tline\ts\tes\tnes\tines\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "denote\tdenote\td\tde\tden\tdeno\te\tte\tote\tnote\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "average\taverage\ta\tav\tave\taver\te\tge\tage\trage\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "distances\tdistances\td\tdi\tdis\tdist\ts\tes\tces\tnces\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "r\tr\tr\tr\tr\tr\tr\tr\tr\tr\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tSAMEFONTSIZE\t0\t1\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "N\tn\tN\tN\tN\tN\tN\tN\tN\tN\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t1\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t1\tALLCAP\tNODIGIT\t1\tCOMMA\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "N\tn\tN\tN\tN\tN\tN\tN\tN\tN\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t1\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "between\tbetween\tb\tbe\tbet\tbetw\tn\ten\teen\tween\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "two\ttwo\tt\ttw\ttwo\ttwo\to\two\ttwo\ttwo\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "valence\tvalence\tv\tva\tval\tvale\te\tce\tnce\tence\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "nucleons\tnucleons\tn\tnu\tnuc\tnucl\ts\tns\tons\teons\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + val postProcessed = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(bodyResult) + + assertThat(postProcessed, `is`(bodyResult)) + } + + @Test + fun testPostProcessFulltextFixInvalidTableOrFigure_singleChangeNeeded_shouldCorrectTheTableOrFigureSequence() { + val bodyResult = + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t1\tI-\n" + + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t0\t
\n" + + "+\t+\t+\t+\t+\t+\t+\t+\t+\t+\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tNEWFONT\tLOWERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t1\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t0\t
\n" + + "0\t0\t0\t0\t0\t0\t0\t0\t0\t0\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "state\tstate\ts\tst\tsta\tstat\te\tte\tate\ttate\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t1\t
\n" + + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "panels\tpanels\tp\tpa\tpan\tpane\ts\tls\tels\tnels\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "a\ta\ta\ta\ta\ta\ta\ta\ta\ta\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "are\tare\ta\tar\tare\tare\te\tre\tare\tare\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + + val postProcessed = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(bodyResult) + + assertThat(postProcessed, not(bodyResult)) + + val splitResult = + Arrays.stream(postProcessed.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map> { l: String -> + Arrays.stream( + l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray() + ) + .collect(Collectors.toList()) + } + .collect(Collectors.toList()) + + val countStartingFigure = splitResult.stream() + .map { l: List -> l.last() } + .filter { l: String -> l.equals("I-
") } + .count() + + assertThat(countStartingFigure, `is`(1)) + } + + @Test + fun testPostProcessFulltextFixInvalidTableOrFigure_MultipleChangeNeeded_shouldCorrectTheTableOrFigureSequence() { + val bodyResult = + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t1\tI-\n" + + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t0\t
\n" + + "+\t+\t+\t+\t+\t+\t+\t+\t+\t+\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tNEWFONT\tLOWERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t1\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t0\t
\n" + + "0\t0\t0\t0\t0\t0\t0\t0\t0\t0\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "state\tstate\ts\tst\tsta\tstat\te\tte\tate\ttate\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t1\t
\n" + + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "panels\tpanels\tp\tpa\tpan\tpane\ts\tls\tels\tnels\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "a\ta\ta\ta\ta\ta\ta\ta\ta\ta\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "are\tare\ta\tar\tare\tare\te\tre\tare\tare\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + + val postProcessed = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(bodyResult) + + assertThat(postProcessed, not(bodyResult)) + + val splitResult = + Arrays.stream(postProcessed.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map> { l: String -> + Arrays.stream( + l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray() + ) + .collect(Collectors.toList()) + } + .collect(Collectors.toList>()) + + val countStartingFigure = splitResult.stream() + .map { l: List -> l.last() } + .filter { l: String -> l.equals("I-
") } + .count() + + assertThat(countStartingFigure, `is`(2)) + + val countStartingTables = splitResult.stream() + .map { l: List -> l.last() } + .filter { l: String -> l.equals("I-
") } + .count() + + assertThat(countStartingTables, `is`(1)) + + } + + @Test + fun testPostProcessFulltextCorrectSequencesWithoutInitialToken_shouldChangeAbstractLabelInAvailabilityLabel() { + val resultHeader = "Data\tdata\tD\tDa\tDat\tData\ta\tta\tata\tData\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tSAMEFONTSIZE\t1\t0\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "Availability\tavailability\tA\tAv\tAva\tAvai\ty\tty\tity\tlity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t1\t0\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "Statement\tstatement\tS\tSt\tSta\tStat\tt\tnt\tent\tment\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t1\t0\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ":\t:\t:\t:\t:\t:\t:\t:\t:\t:\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t1\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tPUNCT\t0\t0\t1\t0\t\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\tI-\n" + + "raw\traw\tr\tra\traw\traw\tw\taw\traw\traw\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "sequencing\tsequencing\ts\tse\tseq\tsequ\tg\tng\ting\tcing\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "reads\treads\tr\tre\trea\tread\ts\tds\tads\teads\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "metagenomic\tmetagenomic\tm\tme\tmet\tmeta\tc\tic\tmic\tomic\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "samples\tsamples\ts\tsa\tsam\tsamp\ts\tes\tles\tples\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t1\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "used\tused\tu\tus\tuse\tused\td\ted\tsed\tused\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "in\tin\ti\tin\tin\tin\tn\tin\tin\tin\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t1\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "this\tthis\tt\tth\tthi\tthis\ts\tis\this\tthis\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "study\tstudy\ts\tst\tstu\tstud\ty\tdy\tudy\ttudy\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "were\twere\tw\twe\twer\twere\te\tre\tere\twere\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "downloaded\tdownloaded\td\tdo\tdow\tdown\td\ted\tded\taded\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "from\tfrom\tf\tfr\tfro\tfrom\tm\tom\trom\tfrom\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "public\tpublic\tp\tpu\tpub\tpubl\tc\tic\tlic\tblic\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "repositories\trepositories\tr\tre\trep\trepo\ts\tes\ties\tries\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "listed\tlisted\tl\tli\tlis\tlist\td\ted\tted\tsted\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "in\tin\ti\tin\tin\tin\tn\tin\tin\tin\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t1\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "following\tfollowing\tf\tfo\tfol\tfoll\tg\tng\ting\twing\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "publications\tpublications\tp\tpu\tpub\tpubl\ts\tns\tons\tions\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ":\t:\t:\t:\t:\t:\t:\t:\t:\t:\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tPUNCT\t0\t0\t1\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "1038\t1038\t1\t10\t103\t1038\t8\t38\t038\t1038\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t1\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "nature11209\tnature11209\tn\tna\tnat\tnatu\t9\t09\t209\t1209\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tCONTAINSDIGITS\t0\t0\t0\t1\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tCOMMA\t0\t0\t1\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "1038\t1038\t1\t10\t103\t1038\t8\t38\t038\t1038\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t1\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "nature11450\tnature11450\tn\tna\tnat\tnatu\t0\t50\t450\t1450\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tCONTAINSDIGITS\t0\t0\t0\t1\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tCOMMA\t0\t0\t1\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "1016\t1016\t1\t10\t101\t1016\t6\t16\t016\t1016\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t1\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "j\tj\tj\tj\tj\tj\tj\tj\tj\tj\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\t0\t0\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "cels\tcels\tc\tce\tcel\tcels\ts\tls\tels\tcels\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "2016\t2016\t2\t20\t201\t2016\t6\t16\t016\t2016\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t1\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "004\t004\t0\t00\t004\t004\t4\t04\t004\t004\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tCOMMA\t0\t0\t1\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "1101\t1101\t1\t11\t110\t1101\t1\t01\t101\t1101\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t1\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "gr\tgr\tg\tgr\tgr\tgr\tr\tgr\tgr\tgr\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "233940\t233940\t2\t23\t233\t2339\t0\t40\t940\t3940\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t1\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "117\t117\t1\t11\t117\t117\t7\t17\t117\t117\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "Data\tdata\tD\tDa\tDat\tData\ta\tta\tata\tData\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "underlying\tunderlying\tu\tun\tund\tunde\tg\tng\ting\tying\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "all\tall\ta\tal\tall\tall\tl\tll\tall\tall\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "figures\tfigures\tf\tfi\tfig\tfigu\ts\tes\tres\tures\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tCOMMA\t0\t0\t1\t0\t\n" + + "such\tsuch\ts\tsu\tsuc\tsuch\th\tch\tuch\tsuch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "as\tas\ta\tas\tas\tas\ts\tas\tas\tas\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "numerical\tnumerical\tn\tnu\tnum\tnume\tl\tal\tcal\tical\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "values\tvalues\tv\tva\tval\tvalu\ts\tes\tues\tlues\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "bar\tbar\tb\tba\tbar\tbar\tr\tar\tbar\tbar\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "plots\tplots\tp\tpl\tplo\tplot\ts\tts\tots\tlots\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tCOMMA\t0\t0\t1\t0\t\n" + + "can\tcan\tc\tca\tcan\tcan\tn\tan\tcan\tcan\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "be\tbe\tb\tbe\tbe\tbe\te\tbe\tbe\tbe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "found\tfound\tf\tfo\tfou\tfoun\td\tnd\tund\tound\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "in\tin\ti\tin\tin\tin\tn\tin\tin\tin\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t1\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "5281\t5281\t5\t52\t528\t5281\t1\t81\t281\t5281\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "zenodo\tzenodo\tz\tze\tzen\tzeno\to\tdo\todo\tnodo\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "10304481\t10304481\t1\t10\t103\t1030\t1\t81\t481\t4481\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t1\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "All\tall\tA\tAl\tAll\tAll\tl\tll\tAll\tAll\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "other\tother\to\tot\toth\tothe\tr\ter\ther\tther\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "metadata\tmetadata\tm\tme\tmet\tmeta\ta\tta\tata\tdata\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tCOMMA\t0\t0\t1\t0\t\n" + + "as\tas\ta\tas\tas\tas\ts\tas\tas\tas\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "well\twell\tw\twe\twel\twell\tl\tll\tell\twell\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "as\tas\ta\tas\tas\tas\ts\tas\tas\tas\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "source\tsource\ts\tso\tsou\tsour\te\tce\trce\turce\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "code\tcode\tc\tco\tcod\tcode\te\tde\tode\tcode\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "sequencing\tsequencing\ts\tse\tseq\tsequ\tg\tng\ting\tcing\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "pipeline\tpipeline\tp\tpi\tpip\tpipe\te\tne\tine\tline\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tCOMMA\t0\t0\t1\t0\t\n" + + "downstream\tdownstream\td\tdo\tdow\tdown\tm\tam\team\tream\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "analyses\tanalyses\ta\tan\tana\tanal\ts\tes\tses\tyses\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tCOMMA\t0\t0\t1\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "figure\tfigure\tf\tfi\tfig\tfigu\te\tre\ture\tgure\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "generation\tgeneration\tg\tge\tgen\tgene\tn\ton\tion\ttion\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "are\tare\ta\tar\tare\tare\te\tre\tare\tare\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "available\tavailable\ta\tav\tava\tavai\te\tle\tble\table\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "at\tat\ta\tat\tat\tat\tt\tat\tat\tat\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "Zenodo\tzenodo\tZ\tZe\tZen\tZeno\to\tdo\todo\tnodo\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tOPENBRACKET\t0\t0\t1\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "5281\t5281\t5\t52\t528\t5281\t1\t81\t281\t5281\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "zenodo\tzenodo\tz\tze\tzen\tzeno\to\tdo\todo\tnodo\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "10368227\t10368227\t1\t10\t103\t1036\t7\t27\t227\t8227\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t0\t0\t0\t1\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tENDBRACKET\t0\t0\t1\t0\t\n" + + "or\tor\to\tor\tor\tor\tr\tor\tor\tor\tBLOCKEND\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "GitHub\tgithub\tG\tGi\tGit\tGitH\tb\tub\tHub\ttHub\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tOPENBRACKET\t0\t0\t1\t0\t\n" + + "https\thttps\th\tht\thtt\thttp\ts\tps\ttps\tttps\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t1\tNOPUNCT\t0\t0\t1\t0\t\n" + + ":\t:\t:\t:\t:\t:\t:\t:\t:\t:\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t1\tPUNCT\t0\t0\t1\t0\t\n" + + "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t1\tNOPUNCT\t0\t0\t1\t0\t\n" + + "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t1\tNOPUNCT\t0\t0\t1\t0\t\n" + + "github\tgithub\tg\tgi\tgit\tgith\tb\tub\thub\tthub\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t1\tNOPUNCT\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t1\tDOT\t0\t0\t1\t0\t\n" + + "com\tcom\tc\tco\tcom\tcom\tm\tom\tcom\tcom\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t1\tNOPUNCT\t0\t0\t1\t0\t\n" + + "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t1\tNOPUNCT\t0\t0\t1\t0\t\n" + + "zhiru\tzhiru\tz\tzh\tzhi\tzhir\tu\tru\tiru\thiru\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t1\tNOPUNCT\t0\t0\t1\t0\t\n" + + "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t1\tHYPHEN\t0\t0\t1\t0\t\n" + + "liu\tliu\tl\tli\tliu\tliu\tu\tiu\tliu\tliu\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t1\tNOPUNCT\t0\t0\t1\t0\t\n" + + "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t1\tNOPUNCT\t0\t0\t1\t0\t\n" + + "microbiome_\tmicrobiome_\tm\tmi\tmic\tmicr\t_\te_\tme_\tome_\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t1\tNOPUNCT\t0\t0\t1\t0\t\n" + + "evolution\tevolution\te\tev\tevo\tevol\tn\ton\tion\ttion\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tENDBRACKET\t0\t0\t1\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKEND\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tDOT\t0\t0\t1\t0\t\n" + + "Funding\tfunding\tF\tFu\tFun\tFund\tg\tng\ting\tding\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tSAMEFONTSIZE\t1\t0\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + ":\t:\t:\t:\t:\t:\t:\t:\t:\t:\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t1\t0\tALLCAP\tNODIGIT\t1\t0\t0\t0\t0\t0\t0\t0\tPUNCT\t0\t0\t1\t0\t\n" + + "This\tthis\tT\tTh\tThi\tThis\ts\tis\this\tThis\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\tI-\n" + + "work\twork\tw\two\twor\twork\tk\trk\tork\twork\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "was\twas\tw\twa\twas\twas\ts\tas\twas\twas\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "supported\tsupported\ts\tsu\tsup\tsupp\td\ted\tted\trted\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t0\t0\t0\t0\tNOPUNCT\t0\t0\t1\t0\t\n" + + "in\tin\ti\tin\tin\tin\tn\tin\tin\tin\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\t1\t1\t0\t0\t1\t0\t0\tNOPUNCT\t0\t0\t1\t0\t" + val postprocessed = LabelUtils.postProcessFulltextCorrectSequencesWithoutInitialToken(resultHeader) + + assertThat( + Arrays.stream(StringUtils.split(postprocessed, "\n")) + .filter { l -> l.endsWith("") } + .count(), `is`(0L) + ) + assertThat( + Arrays.stream(StringUtils.split(postprocessed, "\n")) + .filter { l -> l.endsWith("") } + .count(), `is`(0L) + ) + + assertThat( + Arrays.stream(StringUtils.split(postprocessed, "\n")) + .filter { l -> l.endsWith("") } + .count(), `is`(139) + ) + } + + + companion object { + @JvmStatic + @BeforeAll + @Throws(Exception::class) + fun before() { + val modelParameters = ModelParameters() + modelParameters.name = "bao" + GrobidProperties.addModel(modelParameters) + } + } + +} \ No newline at end of file