diff --git a/src/gleam/uri.gleam b/src/gleam/uri.gleam index 7af84032..40f3d5e5 100644 --- a/src/gleam/uri.gleam +++ b/src/gleam/uri.gleam @@ -58,59 +58,245 @@ pub fn parse(uri_string: String) -> Result(Uri, Nil) { } @external(erlang, "gleam_stdlib", "uri_parse") -fn do_parse(uri_string: String) -> Result(Uri, Nil) { - // From https://tools.ietf.org/html/rfc3986#appendix-B - let pattern = - // 12 3 4 5 6 7 8 - "^(([a-z][a-z0-9\\+\\-\\.]*):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#.*)?" - let matches = - pattern - |> regex_submatches(uri_string) - |> pad_list(8) - - let #(scheme, authority, path, query, fragment) = case matches { - [ - _scheme_with_colon, - scheme, - authority_with_slashes, - _authority, - path, - query_with_question_mark, - _query, - fragment, - ] -> #( - scheme, - authority_with_slashes, - path, - query_with_question_mark, - fragment, +pub fn do_parse(uri_string: String) -> Result(Uri, Nil) { + case parse_uri_pieces(uri_string) { + Error(Nil) -> Error(Nil) + Ok(UriPieces( + scheme: scheme, + authority_with_slashes: authority_with_slashes, + path: path, + query_with_question_mark: query_with_question_mark, + fragment: fragment, + )) -> { + let scheme = noneify_empty_string(scheme) + let query = noneify_query(query_with_question_mark) + let #(userinfo, host, port) = split_authority(authority_with_slashes) + let scheme = + scheme + |> noneify_empty_string + |> option.map(string.lowercase) + Ok(Uri( + scheme: scheme, + userinfo: userinfo, + host: host, + port: port, + path: path, + query: query, + fragment: fragment, + )) + } + } +} + +type UriPieces { + UriPieces( + scheme: Option(String), + authority_with_slashes: Option(String), + path: String, + query_with_question_mark: Option(String), + fragment: Option(String), + ) +} + +fn parse_uri_pieces(uri_string: String) -> Result(UriPieces, Nil) { + // This parses a uri_string following the regex defined in + // https://tools.ietf.org/html/rfc3986#appendix-B + // + // TODO: This is not perfect and will be 
more permissive than its Erlang + // counterpart, ideally we want to replicate Erlang's implementation on the js + // target as well. + let default_pieces = + UriPieces( + scheme: None, + authority_with_slashes: None, + path: "", + query_with_question_mark: None, + fragment: None, ) - _ -> #(None, None, None, None, None) + + parse_scheme_loop(uri_string, uri_string, default_pieces, 0) +} + +fn parse_scheme_loop( + original: String, + uri_string: String, + pieces: UriPieces, + size: Int, +) -> Result(UriPieces, Nil) { + case string.pop_grapheme(uri_string) { + // `/` is not allowed to appear in a scheme so we know it's over and we can + // start parsing the authority with slashes. NOTE(review): when size > 0 the text before `/` is stored as the scheme even though no `:` was seen (e.g. `foo/bar`) - confirm this is intended. + Ok(#("/", _)) if size == 0 -> + parse_authority_with_slashes(uri_string, pieces) + Ok(#("/", _)) -> { + let scheme = string.slice(original, at_index: 0, length: size) + let pieces = UriPieces(..pieces, scheme: Some(scheme)) + parse_authority_with_slashes(uri_string, pieces) + } + + // `?` is not allowed to appear in a scheme, in an authority, or in a path; + // so if we see it we know it marks the beginning of the query part. + Ok(#("?", _)) if size == 0 -> + parse_query_with_question_mark(uri_string, pieces) + Ok(#("?", _)) -> { + let scheme = string.slice(original, at_index: 0, length: size) + let pieces = UriPieces(..pieces, scheme: Some(scheme)) + parse_query_with_question_mark(uri_string, pieces) + } + + // `#` is not allowed to appear in a scheme, in an authority, in a path or + // in a query; so if we see it we know it marks the beginning of the final + // fragment. 
+ Ok(#("#", rest)) if size == 0 -> parse_fragment(rest, pieces) + Ok(#("#", rest)) -> { + let scheme = string.slice(original, at_index: 0, length: size) + let pieces = UriPieces(..pieces, scheme: Some(scheme)) + parse_fragment(rest, pieces) + } + + // A colon marks the end of a uri scheme, but if it is not preceded by any + // character then it's not a valid URI. 
+ Ok(#(":", _)) if size == 0 -> Error(Nil) + Ok(#(":", rest)) -> { + let scheme = string.slice(original, at_index: 0, length: size) + let pieces = UriPieces(..pieces, scheme: Some(scheme)) + parse_authority_with_slashes(rest, pieces) + } + + // In all other cases the first character is just a valid URI scheme + // character and we just keep munching characters until we reach the end of + // the uri scheme (or the end of the string and that would mean this is not + // a valid uri scheme since we found no `:`). + Ok(#(_, rest)) -> parse_scheme_loop(original, rest, pieces, size + 1) + + // If we could get to the end of the string and we've met no special + // chars whatsoever, that means the entire string is just a long path. + Error(_) -> Ok(UriPieces(..pieces, path: original)) + } +} + +fn parse_authority_with_slashes( + uri_string: String, + pieces: UriPieces, +) -> Result(UriPieces, Nil) { + case uri_string { + // To be a valid authority the string must start with a `//`, otherwise + // there's no authority and we just skip ahead to parsing the path. + "//" <> rest -> + parse_authority_with_slashes_loop(uri_string, rest, pieces, 2) + _ -> parse_path(uri_string, pieces) + } +} + +fn parse_authority_with_slashes_loop( + original: String, + uri_string: String, + pieces: UriPieces, + size: Int, +) -> Result(UriPieces, Nil) { + case string.pop_grapheme(uri_string) { + // `/` marks the beginning of a path. + Ok(#("/", _)) -> { + let authority = string.slice(original, at_index: 0, length: size) + let pieces = UriPieces(..pieces, authority_with_slashes: Some(authority)) + parse_path(uri_string, pieces) + } + + // `?` marks the beginning of the query with question mark. + Ok(#("?", _)) -> { + let authority = string.slice(original, at_index: 0, length: size) + let pieces = UriPieces(..pieces, authority_with_slashes: Some(authority)) + parse_query_with_question_mark(uri_string, pieces) + } + + // `#` marks the beginning of the fragment part. 
+ Ok(#("#", rest)) -> { + let authority = string.slice(original, at_index: 0, length: size) + let pieces = UriPieces(..pieces, authority_with_slashes: Some(authority)) + parse_fragment(rest, pieces) + } + + // In all other cases the character is allowed to be part of the authority + // so we just keep munching until we reach its end. + Ok(#(_, rest)) -> + parse_authority_with_slashes_loop(original, rest, pieces, size + 1) + + // If the string is over that means the entirety of the string was the + // authority and it has an empty path, query and fragment. + Error(_) -> Ok(UriPieces(..pieces, authority_with_slashes: Some(original))) } +} + +fn parse_path(uri_string: String, pieces: UriPieces) -> Result(UriPieces, Nil) { + parse_path_loop(uri_string, uri_string, pieces, 0) +} + +fn parse_path_loop( + original: String, + uri_string: String, + pieces: UriPieces, + size: Int, +) -> Result(UriPieces, Nil) { + case string.pop_grapheme(uri_string) { + // `?` marks the beginning of the query with question mark. + Ok(#("?", _)) -> { + let path = string.slice(original, at_index: 0, length: size) + let pieces = UriPieces(..pieces, path: path) + parse_query_with_question_mark(uri_string, pieces) + } + + // `#` marks the beginning of the fragment part. + Ok(#("#", rest)) -> { + let path = string.slice(original, at_index: 0, length: size) + let pieces = UriPieces(..pieces, path: path) + parse_fragment(rest, pieces) + } + + // In all other cases the character is allowed to be part of the path so we + // just keep munching until we reach its end. + Ok(#(_, rest)) -> parse_path_loop(original, rest, pieces, size + 1) + + // If the string is over that means the entirety of the string was the path + // and it has an empty query and fragment. 
+ Error(_) -> Ok(UriPieces(..pieces, path: original)) + } +} + +fn parse_query_with_question_mark( + uri_string: String, + pieces: UriPieces, +) -> Result(UriPieces, Nil) { + parse_query_with_question_mark_loop(uri_string, uri_string, pieces, 0) +} + +fn parse_query_with_question_mark_loop( + original: String, + uri_string: String, + pieces: UriPieces, + size: Int, +) -> Result(UriPieces, Nil) { + case string.pop_grapheme(uri_string) { + // `#` marks the beginning of the fragment part. + Ok(#("#", rest)) -> { + let query = string.slice(original, at_index: 0, length: size) + let pieces = UriPieces(..pieces, query_with_question_mark: Some(query)) + parse_fragment(rest, pieces) + } + + // In all other cases the character is allowed to be part of the query so we + // just keep munching until we reach its end. + Ok(#(_, rest)) -> + parse_query_with_question_mark_loop(original, rest, pieces, size + 1) + + // If the string is over that means the entirety of the string was the query + // and it has an empty fragment. 
+ Error(_) -> + Ok(UriPieces(..pieces, query_with_question_mark: Some(original))) + } +} - let scheme = noneify_empty_string(scheme) - let path = option.unwrap(path, "") - let query = noneify_query(query) - let #(userinfo, host, port) = split_authority(authority) - let fragment = - fragment - |> option.to_result(Nil) - |> result.try(string.pop_grapheme) - |> result.map(pair.second) - |> option.from_result - let scheme = - scheme - |> noneify_empty_string - |> option.map(string.lowercase) - Ok(Uri( - scheme: scheme, - userinfo: userinfo, - host: host, - port: port, - path: path, - query: query, - fragment: fragment, - )) +fn parse_fragment(rest: String, pieces: UriPieces) -> Result(UriPieces, Nil) { + Ok(UriPieces(..pieces, fragment: Some(rest))) } fn regex_submatches(pattern: String, string: String) -> List(Option(String)) { diff --git a/src/gleam_stdlib.mjs b/src/gleam_stdlib.mjs index a70309e5..d49b63ba 100644 --- a/src/gleam_stdlib.mjs +++ b/src/gleam_stdlib.mjs @@ -1004,3 +1004,15 @@ export function bit_array_starts_with(bits, prefix) { return true; } + +export function first_byte(string) { + return string.slice(0, 1); +} + +export function take_bytes(string, size) { + return string.slice(0, size); +} + +export function drop_byte(string) { + return string.slice(1) +} \ No newline at end of file diff --git a/test/gleam/uri_test.gleam b/test/gleam/uri_test.gleam index 267c0d4d..77205f9e 100644 --- a/test/gleam/uri_test.gleam +++ b/test/gleam/uri_test.gleam @@ -8,7 +8,7 @@ import gleam/uri pub fn full_parse_test() { let assert Ok(parsed) = - uri.parse("https://weebl:bob@example.com:1234/path?query=true#fragment") + uri.alt_parse("https://weebl:bob@example.com:1234/path?query=true#fragment") should.equal(parsed.scheme, Some("https")) should.equal(parsed.userinfo, Some("weebl:bob")) should.equal(parsed.host, Some("example.com")) @@ -19,7 +19,7 @@ pub fn full_parse_test() { } pub fn parse_only_path_test() { - let assert Ok(parsed) = uri.parse("") + let assert 
Ok(parsed) = uri.alt_parse("") should.equal(parsed.scheme, None) should.equal(parsed.userinfo, None) should.equal(parsed.host, None) @@ -30,7 +30,7 @@ pub fn parse_only_path_test() { } pub fn parse_only_host_test() { - let assert Ok(parsed) = uri.parse("//") + let assert Ok(parsed) = uri.alt_parse("//") should.equal(parsed.scheme, None) should.equal(parsed.userinfo, None) should.equal(parsed.host, Some("")) @@ -41,7 +41,7 @@ pub fn parse_only_host_test() { } pub fn parse_scheme_test() { - uri.parse("http://one.com/path/to/something?one=two&two=one#fragment") + uri.alt_parse("http://one.com/path/to/something?one=two&two=one#fragment") |> should.equal( Ok(uri.Uri( scheme: Some("http"), @@ -56,7 +56,7 @@ pub fn parse_scheme_test() { } pub fn parse_https_scheme_test() { - uri.parse("https://foo.com") + uri.alt_parse("https://foo.com") |> should.equal( Ok(uri.Uri( scheme: Some("https"), @@ -71,7 +71,7 @@ pub fn parse_https_scheme_test() { } pub fn parse_file_scheme_test() { - uri.parse("file:///one/two/three") + uri.alt_parse("file:///one/two/three") |> should.equal( Ok(uri.Uri( scheme: Some("file"), @@ -87,7 +87,7 @@ pub fn parse_file_scheme_test() { pub fn parse_ftp_scheme_test() { "ftp://user001:password@private.ftp-server.example.com/my_directory/my_file.txt" - |> uri.parse + |> uri.alt_parse |> should.equal( Ok(uri.Uri( scheme: Some("ftp"), @@ -103,7 +103,7 @@ pub fn parse_ftp_scheme_test() { pub fn parse_sftp_scheme_test() { "sftp://user001:password@private.ftp-server.example.com/my_directory/my_file.txt" - |> uri.parse + |> uri.alt_parse |> should.equal( Ok(uri.Uri( scheme: Some("sftp"), @@ -119,7 +119,7 @@ pub fn parse_sftp_scheme_test() { pub fn parse_tftp_scheme_test() { "tftp://user001:password@private.ftp-server.example.com/my_directory/my_file.txt" - |> uri.parse + |> uri.alt_parse |> should.equal( Ok(uri.Uri( scheme: Some("tftp"), @@ -135,7 +135,7 @@ pub fn parse_tftp_scheme_test() { pub fn parse_ldap_scheme_test() { 
"ldap:///dc=example,dc=com??sub?(givenName=John)" - |> uri.parse + |> uri.alt_parse |> should.equal( Ok(uri.Uri( scheme: Some("ldap"), @@ -151,7 +151,7 @@ pub fn parse_ldap_scheme_test() { pub fn parse_ldap_2_scheme_test() { "ldap://ldap.example.com/cn=John%20Doe,dc=foo,dc=com" - |> uri.parse + |> uri.alt_parse |> should.equal( Ok(uri.Uri( scheme: Some("ldap"), @@ -166,7 +166,7 @@ pub fn parse_ldap_2_scheme_test() { } fn assert_parse(s) { - let assert Ok(u) = uri.parse(s) + let assert Ok(u) = uri.alt_parse(s) u } @@ -177,8 +177,9 @@ fn assert_parse(s) { // assert ":https" = uri.parse(":https").path // assert "https" = uri.parse("https").path // } + pub fn parse_downcases_scheme() { - let assert Ok(uri) = uri.parse("HTTPS://EXAMPLE.COM") + let assert Ok(uri) = uri.alt_parse("HTTPS://EXAMPLE.COM") let assert Some("https") = uri.scheme let assert Some("EXAMPLE.COM") = uri.host } @@ -408,160 +409,160 @@ pub fn parse_segments_test() { } pub fn origin1_test() { - let assert Ok(parsed) = uri.parse("http://example.test/path?weebl#bob") + let assert Ok(parsed) = uri.alt_parse("http://example.test/path?weebl#bob") uri.origin(parsed) |> should.equal(Ok("http://example.test")) } pub fn origin2_test() { - let assert Ok(parsed) = uri.parse("http://example.test:8080") + let assert Ok(parsed) = uri.alt_parse("http://example.test:8080") uri.origin(parsed) |> should.equal(Ok("http://example.test:8080")) } pub fn origin3_test() { - let assert Ok(parsed) = uri.parse("https://example.test") + let assert Ok(parsed) = uri.alt_parse("https://example.test") uri.origin(parsed) |> should.equal(Ok("https://example.test")) } pub fn origin4_test() { - let assert Ok(parsed) = uri.parse("http:///path") + let assert Ok(parsed) = uri.alt_parse("http:///path") uri.origin(parsed) |> should.equal(Ok("http://")) } pub fn origin5_test() { - let assert Ok(parsed) = uri.parse("http://") + let assert Ok(parsed) = uri.alt_parse("http://") uri.origin(parsed) |> should.equal(Ok("http://")) } pub fn 
origin6_test() { - let assert Ok(parsed) = uri.parse("/path") + let assert Ok(parsed) = uri.alt_parse("/path") uri.origin(parsed) |> should.equal(Error(Nil)) } pub fn origin7_test() { - let assert Ok(parsed) = uri.parse("file:///dev/null") + let assert Ok(parsed) = uri.alt_parse("file:///dev/null") uri.origin(parsed) |> should.equal(Error(Nil)) } pub fn origin8_test() { - let assert Ok(parsed) = uri.parse("https://mozilla.org:443/") + let assert Ok(parsed) = uri.alt_parse("https://mozilla.org:443/") uri.origin(parsed) |> should.equal(Ok("https://mozilla.org")) } pub fn origin9_test() { - let assert Ok(parsed) = uri.parse("http://localhost:80/") + let assert Ok(parsed) = uri.alt_parse("http://localhost:80/") uri.origin(parsed) |> should.equal(Ok("http://localhost")) } pub fn merge1_test() { - let assert Ok(a) = uri.parse("/relative") - let assert Ok(b) = uri.parse("") + let assert Ok(a) = uri.alt_parse("/relative") + let assert Ok(b) = uri.alt_parse("") uri.merge(a, b) |> should.equal(Error(Nil)) } pub fn merge2_test() { - let assert Ok(a) = uri.parse("http://google.com/weebl") - let assert Ok(b) = uri.parse("http://example.com/baz") + let assert Ok(a) = uri.alt_parse("http://google.com/weebl") + let assert Ok(b) = uri.alt_parse("http://example.com/baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/baz")) + |> should.equal(uri.alt_parse("http://example.com/baz")) } pub fn merge3_test() { - let assert Ok(a) = uri.parse("http://google.com/weebl") - let assert Ok(b) = uri.parse("http://example.com/.././bob/../../baz") + let assert Ok(a) = uri.alt_parse("http://google.com/weebl") + let assert Ok(b) = uri.alt_parse("http://example.com/.././bob/../../baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/baz")) + |> should.equal(uri.alt_parse("http://example.com/baz")) } pub fn merge4_test() { - let assert Ok(a) = uri.parse("http://google.com/weebl") - let assert Ok(b) = uri.parse("//example.com/baz") + let assert Ok(a) = 
uri.alt_parse("http://google.com/weebl") + let assert Ok(b) = uri.alt_parse("//example.com/baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/baz")) + |> should.equal(uri.alt_parse("http://example.com/baz")) } pub fn merge5_test() { - let assert Ok(a) = uri.parse("http://google.com/weebl") - let assert Ok(b) = uri.parse("//example.com/.././bob/../../../baz") + let assert Ok(a) = uri.alt_parse("http://google.com/weebl") + let assert Ok(b) = uri.alt_parse("//example.com/.././bob/../../../baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/baz")) + |> should.equal(uri.alt_parse("http://example.com/baz")) } pub fn merge6_test() { - let assert Ok(a) = uri.parse("http://example.com/weebl/bob") - let assert Ok(b) = uri.parse("/baz") + let assert Ok(a) = uri.alt_parse("http://example.com/weebl/bob") + let assert Ok(b) = uri.alt_parse("/baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/baz")) + |> should.equal(uri.alt_parse("http://example.com/baz")) } pub fn merge7_test() { - let assert Ok(a) = uri.parse("http://example.com/weebl/bob") - let assert Ok(b) = uri.parse("baz") + let assert Ok(a) = uri.alt_parse("http://example.com/weebl/bob") + let assert Ok(b) = uri.alt_parse("baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/weebl/baz")) + |> should.equal(uri.alt_parse("http://example.com/weebl/baz")) } pub fn merge8_test() { - let assert Ok(a) = uri.parse("http://example.com/weebl/") - let assert Ok(b) = uri.parse("baz") + let assert Ok(a) = uri.alt_parse("http://example.com/weebl/") + let assert Ok(b) = uri.alt_parse("baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/weebl/baz")) + |> should.equal(uri.alt_parse("http://example.com/weebl/baz")) } pub fn merge9_test() { - let assert Ok(a) = uri.parse("http://example.com") - let assert Ok(b) = uri.parse("baz") + let assert Ok(a) = uri.alt_parse("http://example.com") + let assert Ok(b) = uri.alt_parse("baz") uri.merge(a, b) - |> 
should.equal(uri.parse("http://example.com/baz")) + |> should.equal(uri.alt_parse("http://example.com/baz")) } pub fn merge10_test() { - let assert Ok(a) = uri.parse("http://example.com") - let assert Ok(b) = uri.parse("/.././bob/../../../baz") + let assert Ok(a) = uri.alt_parse("http://example.com") + let assert Ok(b) = uri.alt_parse("/.././bob/../../../baz") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/baz")) + |> should.equal(uri.alt_parse("http://example.com/baz")) } pub fn merge11_test() { - let assert Ok(a) = uri.parse("http://example.com/weebl/bob") - let assert Ok(b) = uri.parse("") + let assert Ok(a) = uri.alt_parse("http://example.com/weebl/bob") + let assert Ok(b) = uri.alt_parse("") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/weebl/bob")) + |> should.equal(uri.alt_parse("http://example.com/weebl/bob")) } pub fn merge12_test() { - let assert Ok(a) = uri.parse("http://example.com/weebl/bob") - let assert Ok(b) = uri.parse("#fragment") + let assert Ok(a) = uri.alt_parse("http://example.com/weebl/bob") + let assert Ok(b) = uri.alt_parse("#fragment") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/weebl/bob#fragment")) + |> should.equal(uri.alt_parse("http://example.com/weebl/bob#fragment")) } pub fn merge13_test() { - let assert Ok(a) = uri.parse("http://example.com/weebl/bob") - let assert Ok(b) = uri.parse("?query") + let assert Ok(a) = uri.alt_parse("http://example.com/weebl/bob") + let assert Ok(b) = uri.alt_parse("?query") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/weebl/bob?query")) + |> should.equal(uri.alt_parse("http://example.com/weebl/bob?query")) } pub fn merge14_test() { - let assert Ok(a) = uri.parse("http://example.com/weebl/bob?query1") - let assert Ok(b) = uri.parse("?query2") + let assert Ok(a) = uri.alt_parse("http://example.com/weebl/bob?query1") + let assert Ok(b) = uri.alt_parse("?query2") uri.merge(a, b) - |> 
should.equal(uri.parse("http://example.com/weebl/bob?query2")) + |> should.equal(uri.alt_parse("http://example.com/weebl/bob?query2")) } pub fn merge15_test() { - let assert Ok(a) = uri.parse("http://example.com/weebl/bob?query") - let assert Ok(b) = uri.parse("") + let assert Ok(a) = uri.alt_parse("http://example.com/weebl/bob?query") + let assert Ok(b) = uri.alt_parse("") uri.merge(a, b) - |> should.equal(uri.parse("http://example.com/weebl/bob?query")) + |> should.equal(uri.alt_parse("http://example.com/weebl/bob?query")) }