This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new a7e09412a2 fix(spark): return input string for PATH/FILE on schemeless
URLs in `parse_url` (#20506)
a7e09412a2 is described below
commit a7e09412a2b7e688e87d87b580884a8dc938234f
Author: David López <[email protected]>
AuthorDate: Mon Mar 16 21:46:50 2026 +0100
fix(spark): return input string for PATH/FILE on schemeless URLs in
`parse_url` (#20506)
## Which issue does this PR close?
- NA
## Rationale for this change
Spark's `java.net.URI` treats schemeless strings (e.g. `'notaurl'`) as
relative URIs where the entire input becomes the path component. The
Rust `url` crate rejects these with `RelativeUrlWithoutBase`, and the
current implementation maps all such errors to `NULL` — but Spark
returns the input string for `PATH` and `FILE`.
## What changes are included in this PR?
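- In `parse_url.rs`, when catching `RelativeUrlWithoutBase` for
schemeless URLs, split the input into path, query, and fragment and
return the matching component for `PATH`, `FILE`, `QUERY`, and `REF`
instead of `NULL` (for an input with no `?` or `#`, `PATH` and `FILE`
are the input string itself)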
- Updated unit tests and sqllogictests for both `parse_url` and
`try_parse_url`
## Are these changes tested?
Yes:
- Unit test `test_parse_schemeless_url` covers all 8 URL parts against a
schemeless input
- sqllogictest coverage in `parse_url.slt` and `try_parse_url.slt`
## Are there any user-facing changes?
Yes — `parse_url('notaurl', 'PATH')` and `parse_url('notaurl', 'FILE')`
now return `'notaurl'` instead of `NULL`, matching Spark behavior.
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/spark/src/function/url/parse_url.rs | 126 ++++++++++++++++++-
.../test_files/spark/url/parse_url.slt | 133 +++++++++++++++++++++
.../test_files/spark/url/try_parse_url.slt | 133 +++++++++++++++++++++
3 files changed, 387 insertions(+), 5 deletions(-)
diff --git a/datafusion/spark/src/function/url/parse_url.rs
b/datafusion/spark/src/function/url/parse_url.rs
index 7beb02f775..50591fb25e 100644
--- a/datafusion/spark/src/function/url/parse_url.rs
+++ b/datafusion/spark/src/function/url/parse_url.rs
@@ -84,7 +84,35 @@ impl ParseUrl {
let url: std::result::Result<Url, ParseError> = Url::parse(value);
if let Err(ParseError::RelativeUrlWithoutBase) = url {
return if !value.contains("://") {
- Ok(None)
+ // Schemeless URLs are treated as relative URIs (like java.net.URI).
+ // Manually parse path, query, and fragment components.
+ let (without_fragment, fragment) = match value.split_once('#')
{
+ Some((before, frag)) => (before, Some(frag)),
+ None => (value, None),
+ };
+ let (path, query) = match without_fragment.split_once('?') {
+ Some((p, q)) => (p, Some(q)),
+ None => (without_fragment, None),
+ };
+ Ok(match part {
+ "PATH" => Some(path.to_string()),
+ "QUERY" => match key {
+ None => query.map(String::from),
+ Some(key) => query.and_then(|q| {
+ q.split('&')
+ .filter_map(|pair| pair.split_once('='))
+ .find(|(k, _)| *k == key)
+ .map(|(_, v)| v.to_string())
+ }),
+ },
+ "REF" => fragment.map(String::from),
+ "FILE" => {
+ // FILE = path + query (without fragment)
+ Some(without_fragment.to_string())
+ }
+ // HOST, PROTOCOL, AUTHORITY, USERINFO → NULL
+ _ => None,
+ })
} else {
Err(exec_datafusion_err!(
"The url is invalid: {value}. Use `try_parse_url` to
tolerate invalid URL and return NULL instead. SQLSTATE: 22P02"
@@ -199,6 +227,7 @@ pub fn spark_handled_parse_url(
as_string_array(part)?,
as_string_array(key)?,
handler_err,
+ true,
)
}
(DataType::Utf8View, DataType::Utf8View, DataType::Utf8View) => {
@@ -207,6 +236,7 @@ pub fn spark_handled_parse_url(
as_string_view_array(part)?,
as_string_view_array(key)?,
handler_err,
+ true,
)
}
(DataType::LargeUtf8, DataType::LargeUtf8, DataType::LargeUtf8) =>
{
@@ -215,6 +245,7 @@ pub fn spark_handled_parse_url(
as_large_string_array(part)?,
as_large_string_array(key)?,
handler_err,
+ true,
)
}
_ => exec_err!(
@@ -240,6 +271,7 @@ pub fn spark_handled_parse_url(
as_string_array(part)?,
&key,
handler_err,
+ false,
)
}
(DataType::Utf8View, DataType::Utf8View) => {
@@ -248,6 +280,7 @@ pub fn spark_handled_parse_url(
as_string_view_array(part)?,
&key,
handler_err,
+ false,
)
}
(DataType::LargeUtf8, DataType::LargeUtf8) => {
@@ -256,6 +289,7 @@ pub fn spark_handled_parse_url(
as_large_string_array(part)?,
&key,
handler_err,
+ false,
)
}
_ => exec_err!(
@@ -272,6 +306,7 @@ fn process_parse_url<'a, A, B, C, T>(
part_array: &'a B,
key_array: &'a C,
handle: impl Fn(Result<Option<String>>) -> Result<Option<String>>,
+ has_key_arg: bool,
) -> Result<ArrayRef>
where
&'a A: StringArrayType<'a>,
@@ -284,7 +319,11 @@ where
.zip(part_array.iter())
.zip(key_array.iter())
.map(|((url, part), key)| {
- if let (Some(url), Some(part), key) = (url, part, key) {
+ // Spark returns NULL when the third argument is explicitly NULL
+ if has_key_arg && key.is_none() {
+ return Ok(None);
+ }
+ if let (Some(url), Some(part)) = (url, part) {
handle(ParseUrl::parse(url, part, key))
} else {
Ok(None)
@@ -357,9 +396,86 @@ mod tests {
}
#[test]
- fn test_parse_malformed_url_returns_error() -> Result<()> {
- let got = ParseUrl::parse("notaurl", "HOST", None)?;
- assert_eq!(got, None);
+ fn test_parse_schemeless_url() -> Result<()> {
+ // Spark's java.net.URI treats schemeless strings as relative URIs.
+ // Simple schemeless string: no query, no fragment.
+ assert_eq!(
+ ParseUrl::parse("notaurl", "PATH", None)?,
+ Some("notaurl".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl", "FILE", None)?,
+ Some("notaurl".to_string())
+ );
+ assert_eq!(ParseUrl::parse("notaurl", "HOST", None)?, None);
+ assert_eq!(ParseUrl::parse("notaurl", "PROTOCOL", None)?, None);
+ assert_eq!(ParseUrl::parse("notaurl", "QUERY", None)?, None);
+ assert_eq!(ParseUrl::parse("notaurl", "REF", None)?, None);
+ assert_eq!(ParseUrl::parse("notaurl", "AUTHORITY", None)?, None);
+ assert_eq!(ParseUrl::parse("notaurl", "USERINFO", None)?, None);
+
+ // Schemeless URL with query string
+ assert_eq!(
+ ParseUrl::parse("notaurl?key=value", "PATH", None)?,
+ Some("notaurl".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl?key=value", "FILE", None)?,
+ Some("notaurl?key=value".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl?key=value", "QUERY", None)?,
+ Some("key=value".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl?key=value", "QUERY", Some("key"))?,
+ Some("value".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl?key=value", "QUERY", Some("missing"))?,
+ None
+ );
+ assert_eq!(ParseUrl::parse("notaurl?key=value", "HOST", None)?, None);
+ assert_eq!(
+ ParseUrl::parse("notaurl?key=value", "PROTOCOL", None)?,
+ None
+ );
+
+ // Schemeless URL with fragment
+ assert_eq!(
+ ParseUrl::parse("notaurl#reference", "REF", None)?,
+ Some("reference".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl#reference", "PATH", None)?,
+ Some("notaurl".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl#reference", "FILE", None)?,
+ Some("notaurl".to_string())
+ );
+
+ // Schemeless URL with both query and fragment
+ assert_eq!(
+ ParseUrl::parse("notaurl?a=1&b=2#frag", "PATH", None)?,
+ Some("notaurl".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl?a=1&b=2#frag", "QUERY", None)?,
+ Some("a=1&b=2".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl?a=1&b=2#frag", "QUERY", Some("b"))?,
+ Some("2".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl?a=1&b=2#frag", "REF", None)?,
+ Some("frag".to_string())
+ );
+ assert_eq!(
+ ParseUrl::parse("notaurl?a=1&b=2#frag", "FILE", None)?,
+ Some("notaurl?a=1&b=2".to_string())
+ );
Ok(())
}
diff --git a/datafusion/sqllogictest/test_files/spark/url/parse_url.slt
b/datafusion/sqllogictest/test_files/spark/url/parse_url.slt
index f2dc55f755..7a5051d50e 100644
--- a/datafusion/sqllogictest/test_files/spark/url/parse_url.slt
+++ b/datafusion/sqllogictest/test_files/spark/url/parse_url.slt
@@ -140,6 +140,96 @@ SELECT parse_url('notaurl', 'host');
----
NULL
+# Schemeless URLs: Spark java.net.URI behavior
+# Simple schemeless string
+query T
+SELECT parse_url('notaurl', 'PATH');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl', 'FILE');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl', 'PROTOCOL');
+----
+NULL
+
+query T
+SELECT parse_url('notaurl', 'QUERY');
+----
+NULL
+
+# Schemeless URL with query string
+query T
+SELECT parse_url('notaurl?key=value', 'PATH');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl?key=value', 'FILE');
+----
+notaurl?key=value
+
+query T
+SELECT parse_url('notaurl?key=value', 'QUERY');
+----
+key=value
+
+query T
+SELECT parse_url('notaurl?key=value', 'QUERY', 'key');
+----
+value
+
+query T
+SELECT parse_url('notaurl?key=value', 'HOST');
+----
+NULL
+
+# Schemeless URL with fragment
+query T
+SELECT parse_url('notaurl#reference', 'REF');
+----
+reference
+
+query T
+SELECT parse_url('notaurl#reference', 'PATH');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl#reference', 'FILE');
+----
+notaurl
+
+# Schemeless URL with both query and fragment
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'PATH');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'QUERY');
+----
+a=1&b=2
+
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'QUERY', 'b');
+----
+2
+
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'REF');
+----
+frag
+
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'FILE');
+----
+notaurl?a=1&b=2
+
query T
SELECT parse_url('https://example.com', 'PATH');
----
@@ -175,3 +265,46 @@ SELECT parse_url();
query error DataFusion error: Execution error: The url is invalid: inva
lid://spark\.apache\.org/path\?query=1\. Use `try_parse_url` to tolerate
invalid URL and return NULL instead\. SQLSTATE: 22P02
SELECT parse_url('inva lid://spark.apache.org/path?query=1', 'QUERY');
+
+# NULL argument handling (Sail PR #1393)
+# NULL URL should return NULL
+query T
+SELECT parse_url(NULL, 'HOST');
+----
+NULL
+
+# NULL part should return NULL
+query T
+SELECT parse_url('https://example.com/path?query=1', NULL);
+----
+NULL
+
+# Both NULL should return NULL
+query T
+SELECT parse_url(NULL, NULL);
+----
+NULL
+
+# NULL URL with 3 args
+query T
+SELECT parse_url(NULL, 'QUERY', 'key');
+----
+NULL
+
+# NULL part with 3 args
+query T
+SELECT parse_url('https://example.com/path?query=1', NULL, 'key');
+----
+NULL
+
+# NULL key with 3 args (valid URL and part) - Spark returns NULL when third arg is NULL
+query T
+SELECT parse_url('https://example.com/path?query=1', 'QUERY', NULL);
+----
+NULL
+
+# All three NULL
+query T
+SELECT parse_url(NULL, NULL, NULL);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt
b/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt
index 403747c63c..a0e42a1648 100644
--- a/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt
+++ b/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt
@@ -91,6 +91,96 @@ SELECT try_parse_url('notaurl', 'host');
----
NULL
+# Schemeless URLs: Spark java.net.URI behavior
+# Simple schemeless string
+query T
+SELECT try_parse_url('notaurl', 'PATH');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl', 'FILE');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl', 'PROTOCOL');
+----
+NULL
+
+query T
+SELECT try_parse_url('notaurl', 'QUERY');
+----
+NULL
+
+# Schemeless URL with query string
+query T
+SELECT try_parse_url('notaurl?key=value', 'PATH');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl?key=value', 'FILE');
+----
+notaurl?key=value
+
+query T
+SELECT try_parse_url('notaurl?key=value', 'QUERY');
+----
+key=value
+
+query T
+SELECT try_parse_url('notaurl?key=value', 'QUERY', 'key');
+----
+value
+
+query T
+SELECT try_parse_url('notaurl?key=value', 'HOST');
+----
+NULL
+
+# Schemeless URL with fragment
+query T
+SELECT try_parse_url('notaurl#reference', 'REF');
+----
+reference
+
+query T
+SELECT try_parse_url('notaurl#reference', 'PATH');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl#reference', 'FILE');
+----
+notaurl
+
+# Schemeless URL with both query and fragment
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'PATH');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'QUERY');
+----
+a=1&b=2
+
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'QUERY', 'b');
+----
+2
+
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'REF');
+----
+frag
+
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'FILE');
+----
+notaurl?a=1&b=2
+
query T
SELECT try_parse_url('https://example.com', 'PATH');
----
@@ -125,3 +215,46 @@ query T
SELECT try_parse_url('inva lid://spark.apache.org/path?query=1', 'QUERY');
----
NULL
+
+# NULL argument handling (Sail PR #1393)
+# NULL URL should return NULL
+query T
+SELECT try_parse_url(NULL, 'HOST');
+----
+NULL
+
+# NULL part should return NULL
+query T
+SELECT try_parse_url('https://example.com/path?query=1', NULL);
+----
+NULL
+
+# Both NULL should return NULL
+query T
+SELECT try_parse_url(NULL, NULL);
+----
+NULL
+
+# NULL URL with 3 args
+query T
+SELECT try_parse_url(NULL, 'QUERY', 'key');
+----
+NULL
+
+# NULL part with 3 args
+query T
+SELECT try_parse_url('https://example.com/path?query=1', NULL, 'key');
+----
+NULL
+
+# NULL key with 3 args (valid URL and part) - Spark returns NULL when third arg is NULL
+query T
+SELECT try_parse_url('https://example.com/path?query=1', 'QUERY', NULL);
+----
+NULL
+
+# All three NULL
+query T
+SELECT try_parse_url(NULL, NULL, NULL);
+----
+NULL
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]