Skip to content

Commit

Permalink
Optimized parsing of values and fixed bug, where links in italics weren't parsed correctly.
Browse files Browse the repository at this point in the history
  • Loading branch information
HerrKnarz committed Sep 28, 2024
1 parent 7ceec67 commit 003e96e
Showing 1 changed file with 12 additions and 15 deletions.
27 changes: 12 additions & 15 deletions Metadata/WikipediaMetadata/WikitextParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -153,13 +153,15 @@ internal List<MetadataNameProperty> CleanUpAndSplit(TemplateArgument argument, s
var values = new List<MetadataNameProperty>();

// If the value is the only one and is a link, we return it without splitting.
if (value.Count(c => c == '[') == 2 && value.Count(c => c == ']') == 2 && value.StartsWith("[") && value.EndsWith("]"))
if (IsSingleLink(value))
{
values.Add(new MetadataNameProperty(parser.Parse(value).ToPlainText(NodePlainTextOptions.RemoveRefTags).Trim()));
return values;
}

// Now the build the list of separators to split the values by.
value = parser.Parse(value).ToPlainText(NodePlainTextOptions.RemoveRefTags);

// Now we build the list of separators to split the values by.
var separators = new List<string>();

separators.AddRange(Resources.StringSeparators);
Expand All @@ -178,9 +180,9 @@ internal List<MetadataNameProperty> CleanUpAndSplit(TemplateArgument argument, s

// Now we split the values by the list of separators and parse the result to get the plain text values.
values.AddRange(value.Split(separators.ToArray(), 100, StringSplitOptions.RemoveEmptyEntries)
.Select(segment => parser.Parse(segment).ToPlainText(NodePlainTextOptions.RemoveRefTags).Trim())
.Where(segmentEditable => segmentEditable.Length > 0)
.Select(segmentEditable => new MetadataNameProperty(segmentEditable)));
.Select(segment => segment.Trim())
.Where(segment => segment.Length > 0)
.Select(segment => new MetadataNameProperty(segment)));

return values;
}
Expand Down Expand Up @@ -476,24 +478,19 @@ internal List<MetadataProperty> GetValues(Template infoBox, string field, bool r
/// <returns>The cleaned up argument</returns>
internal TemplateArgument StripUnwantedElements(TemplateArgument argument)
{
// First we remove every template we don't want: a template is unwanted when its name, after
// normalization and clean-up, is contained in Resources.UnwantedTemplateNames.
// The matches are materialized with ToList() before the loop, so the nodes are removed while
// iterating over that snapshot instead of over the live descendant enumeration.
foreach (var item in argument.EnumDescendants().OfType<Template>().Where(t =>
Resources.UnwantedTemplateNames.Contains(
CleanTemplateName(MwParserUtility.NormalizeTemplateArgumentName(t.Name)))).ToList())
{
item.Remove();
}

// Now we also remove <ref> tags, because they contain footnotes etc. that we don't need.
// Every descendant of each line of the argument's value whose text representation starts with
// "<ref" is removed; as above, the matches are collected with ToList() before removal.
foreach (var line in argument.Value.Lines)
{
foreach (var item in line.EnumDescendants().Where(t => t.ToString().StartsWith("<ref")).ToList())
{
item.Remove();
}
}

// The same argument instance that was passed in is returned after the removals.
return argument;
}

/// <summary>
/// Checks whether a raw wikitext value consists of exactly one wiki link, optionally wrapped in italics markup.
/// </summary>
/// <param name="value">The raw wikitext value to inspect.</param>
/// <returns>
/// <c>true</c> if the value contains exactly two opening and two closing square brackets and has the form
/// <c>[[...]]</c> or <c>''[[...]]''</c>; otherwise <c>false</c>.
/// </returns>
private static bool IsSingleLink(string value)
{
    // A single wiki link contains exactly one "[[" and one "]]", i.e. two brackets of each kind.
    if (value.Count(c => c == '[') != 2 || value.Count(c => c == ']') != 2)
    {
        return false;
    }

    // Require the doubled brackets at both ends. Checking only for a single leading "[" and trailing "]"
    // would also accept two separate single-bracket links such as "[a], [b]", which must still be split.
    // Ordinal comparison is used because this is markup matching, not linguistic text comparison.
    var isPlainLink = value.StartsWith("[[", StringComparison.Ordinal)
        && value.EndsWith("]]", StringComparison.Ordinal);

    var isItalicLink = value.StartsWith("''[[", StringComparison.Ordinal)
        && value.EndsWith("]]''", StringComparison.Ordinal);

    return isPlainLink || isItalicLink;
}
}
}

0 comments on commit 003e96e

Please sign in to comment.