Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve external link parsing #702

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,13 @@ object TemplateTransformConfig {
private def extractFirstExternalLinkNode(node: Option[PropertyNode]) : Option[ExternalLinkNode] = {
node
.flatMap(_.children
.map(c => {
if (c.isInstanceOf[TextNode] && c.toPlainText.contains(".") && !c.toPlainText.contains(" ")) {
val text = c.toPlainText
val triedUri = UriUtils.createURI(if (!text.startsWith("http") && !text.contains(":")) "http://" + text else text)
triedUri.map(uri => ExternalLinkNode(uri, c.children, c.line)).getOrElse(c)
} else c
})
.filter(c => c.isInstanceOf[ExternalLinkNode])
.map(_.asInstanceOf[ExternalLinkNode])
.headOption
Expand Down Expand Up @@ -169,7 +176,8 @@ object TemplateTransformConfig {
PropertyNode("link-title", List(TextNode("", node.line)), node.line)
}

// Check if this uri has a scheme. If it does not, add a default http:// scheme

// Check if this uri has a scheme. If it does not, add a default http:// scheme
// From https://en.wikipedia.org/wiki/Template:URL:
// The first parameter is parsed to see if it takes the form of a complete URL.
// If it doesn't start with a URI scheme (such as "http:", "https:", or "ftp:"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ object SimpleWikiParser
private val externalLinkLabelOrEnd = new Matcher(List(" ", "]", "\n"))
private val externalLinkEnd = new Matcher(List("]", "\n"), true)

private val linkEnd = new Matcher(List(" ", "{","}", "[", "]", "\n", "\t"))
private val linkEnd = new Matcher(List(" ", "{","}", "[", "]", "|", "\n", "\t"))

// '|=' is not valid wiki markup but safe to include, see http://sourceforge.net/tracker/?func=detail&atid=935521&aid=3572779&group_id=190976
private val propertyValueOrEnd = new Matcher(List("|=","=", "|", "}}"), true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ class LinkParserTest extends FlatSpec with Matchers
}

it should "return http://EXAMPLE.COM" in {
parse("{{URL|EXAMPLE.com}}") should equal (Some(build("http://EXAMPLE.COM")))
parse("{{URL|EXAMPLE.COM}}") should equal (Some(build("http://EXAMPLE.COM")))
}

it should "return http://www.example.com" in {
Expand Down Expand Up @@ -100,17 +100,20 @@ class LinkParserTest extends FlatSpec with Matchers
private val parser = WikiParser.getInstance()
private val notStrictParser = new LinkParser(strict = false)

private def build(uri: String) : URI = {
URI.create(uri)
private def build(uri: String) : String = {
URI.create(uri).toString
}

private def parse(input : String) : Option[IRI] =
private def parse(input : String) : Option[String] =
{
val page = new WikiPage(WikiTitle.parse("TestPage", Language.English), input)

// Not strict parsing
parser(page) match {
case Some(n) => notStrictParser.parse(n).map(_.value)
case Some(n) => {
val option = notStrictParser.parse(n)
option.map(_.value.toString)
}
case None => None
}
}
Expand Down
10 changes: 8 additions & 2 deletions dump/src/test/resources/shaclTestsCoverageTable.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,10 @@ wikipage-uri|shacl-test|issue|comment
[http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation |
[http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/last1](http://dbpedia.org/property/last1) #Citation_english_language_last1_datatype_validation |
[http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/last](http://dbpedia.org/property/last) #Citation_english_language_last_datatype_validation |
[http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/page](http://dbpedia.org/property/page) #Citation_english_language_page_datatype_validation |
[http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation |
[http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/work](http://dbpedia.org/property/work) #Citation_english_language_work_datatype_validation |
[http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/year](http://dbpedia.org/property/year) #Citation_english_languagа_year_datatype_validation |
[http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://www.w3.org/2003/01/geo/wgs84_pos#long](http://www.w3.org/2003/01/geo/wgs84_pos#long) #wgs84_lat_long | | generic test for range of wgs84 lat/long |
[http://en.dbpedia.org/resource/Atlantic_Ocean](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Atlantic_Ocean&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation |
[http://en.dbpedia.org/resource/Atlantic_Ocean](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Atlantic_Ocean&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation |
Expand Down Expand Up @@ -117,12 +119,16 @@ wikipage-uri|shacl-test|issue|comment
[http://en.dbpedia.org/resource/Kerala_Agricultural_University](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Kerala_Agricultural_University&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation |
[http://en.dbpedia.org/resource/Kerala_Agricultural_University](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Kerala_Agricultural_University&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation |
[http://en.dbpedia.org/resource/Kerala_Agricultural_University](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Kerala_Agricultural_University&revid=&format=trix&extractors=custom) | [http://www.w3.org/2003/01/geo/wgs84_pos#long](http://www.w3.org/2003/01/geo/wgs84_pos#long) #wgs84_lat_long | | generic test for range of wgs84 lat/long |
[http://en.dbpedia.org/resource/Mini_(Mark_I)](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Mini_(Mark_I)&revid=&format=trix&extractors=custom) |
[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) |
[http://en.dbpedia.org/resource/Mini_(Mark_I)](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Mini_(Mark_I)&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation |
[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation |
[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation |
[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation |
[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/work](http://dbpedia.org/property/work) #Citation_english_language_work_datatype_validation |
[http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation |
[http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation |
[http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/last](http://dbpedia.org/property/last) #Citation_english_language_last_datatype_validation |
[http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation |
[http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/work](http://dbpedia.org/property/work) #Citation_english_language_work_datatype_validation |
[http://en.dbpedia.org/resource/Redd_Kross](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Redd_Kross&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation |
[http://en.dbpedia.org/resource/Redd_Kross](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Redd_Kross&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation |
[http://en.dbpedia.org/resource/Redd_Kross](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Redd_Kross&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/isbn](http://dbpedia.org/property/isbn) #en_property_isbn_citation |
Expand Down