sourcemeta · jviotti · Oct 14, 2025 · Sep 3, 2025 · Oct 14, 2025
diff --git a/src/core/uri/escaping.h b/src/core/uri/escaping.h
@@ -37,12 +37,14 @@ enum class URIEscapeMode : std::uint8_t {
 };
 
 inline auto uri_escape(std::istream &input, std::ostream &output,
-                       const URIEscapeMode mode) -> void {
+                       const URIEscapeMode mode,
+                       const bool preserve_percent_sequences = true) -> void {
   char character = 0;
   while (input.get(character)) {
-    // Check if this is an already percent-encoded sequence (%HEX HEX)
+    // Check if this is an already percent-encoded sequence (%HEXHEX)
     // If so, preserve it as-is to avoid double-encoding
-    if (character == URI_PERCENT) {
+    // (only when preserve_percent_sequences is true)
+    if (preserve_percent_sequences && character == URI_PERCENT) {
       const auto position = input.tellg();
       char next_1 = 0;
       char next_2 = 0;
@@ -131,14 +133,10 @@ inline auto uri_unescape(std::istream &input, std::ostream &output) -> void {
   }
 }
 
-// Selective unescaping for URI normalization (in-place modification)
-// Only unescapes unreserved characters, keeps reserved characters encoded
-// but normalizes hex digits to uppercase
-// unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+// Full unescaping for URI normalization (in-place modification)
+// Decodes every percent-encoded sequence; the "selective" name is historical
 // Modifies the input string in-place for zero-copy performance
-inline auto uri_unescape_selective_inplace(std::string &str,
-                                           bool allow_colon_at = false)
-    -> void {
+inline auto uri_unescape_selective_inplace(std::string &str) -> void {
   std::string::size_type write_pos = 0;
 
   for (std::string::size_type read_pos = 0; read_pos < str.size();) {
@@ -165,24 +163,10 @@ inline auto uri_unescape_selective_inplace(std::string &str,
       const auto value = static_cast<unsigned char>(
           (hex_to_int(first_digit) << 4) | hex_to_int(second_digit));
 
-      // Decode unreserved characters: ALPHA / DIGIT / "-" / "." / "_" / "~"
-      // For URNs/tags, also decode ":" and "@"
-      const auto is_unreserved = uri_is_unreserved(static_cast<char>(value));
-      const auto is_urn_allowed =
-          allow_colon_at && (value == URI_COLON || value == URI_AT);
-
-      if (is_unreserved || is_urn_allowed) {
-        str[write_pos++] = static_cast<char>(value);
-        read_pos += 3;
-      } else {
-        // Keep it percent-encoded (but normalize to uppercase hex)
-        str[write_pos++] = URI_PERCENT;
-        str[write_pos++] = static_cast<char>(
-            std::toupper(static_cast<unsigned char>(first_digit)));
-        str[write_pos++] = static_cast<char>(
-            std::toupper(static_cast<unsigned char>(second_digit)));
-        read_pos += 3;
-      }
+      // Decode all percent-encoded sequences
+      // Internal storage is always fully decoded
+      str[write_pos++] = static_cast<char>(value);
+      read_pos += 3;
     } else {
       str[write_pos++] = str[read_pos++];
     }
@@ -191,14 +175,11 @@ inline auto uri_unescape_selective_inplace(std::string &str,
   str.resize(write_pos);
 }
 
-// Selective unescaping for URI normalization (copy version for compatibility)
-// Only unescapes unreserved characters, keeps reserved characters encoded
-// but normalizes hex digits to uppercase
-// unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
-inline auto uri_unescape_selective(std::string_view input,
-                                   bool allow_colon_at = false) -> std::string {
+// Full unescaping for URI normalization (copy version for compatibility)
+// Returns a decoded copy of `input`; "selective" in the name is historical
+inline auto uri_unescape_selective(std::string_view input) -> std::string {
   std::string result{input};
-  uri_unescape_selective_inplace(result, allow_colon_at);
+  uri_unescape_selective_inplace(result);
   return result;
 }
 

diff --git a/src/core/uri/filesystem.cc b/src/core/uri/filesystem.cc
@@ -1,11 +1,8 @@
 #include <sourcemeta/core/uri.h>
 
-#include "escaping.h"
-
 #include <algorithm>  // std::ranges::replace
 #include <filesystem> // std::filesystem
 #include <iterator>   // std::advance, std::next
-#include <sstream>    // std::istringstream, std::ostringstream
 #include <string>     // std::string
 
 namespace sourcemeta::core {
@@ -28,11 +25,8 @@ auto URI::to_path() const -> std::filesystem::path {
     std::ranges::replace(path, '/', '\\');
   }
 
-  // Unescape percent-encoded characters
-  std::istringstream input{path};
-  std::ostringstream output;
-  uri_unescape(input, output);
-  return output.str();
+  // The path is already fully decoded, so no unescaping is needed here
+  return path;
 }
 
 auto URI::from_path(const std::filesystem::path &path) -> URI {
@@ -60,10 +54,7 @@ auto URI::from_path(const std::filesystem::path &path) -> URI {
 
   // For UNC paths, the first segment is the hostname
   if (is_unc) {
-    std::istringstream input{iterator->string()};
-    std::ostringstream output;
-    uri_escape(input, output, URIEscapeMode::Filesystem);
-    result.host_ = output.str();
+    result.host_ = iterator->string();
     std::advance(iterator, 1);
   }
 
@@ -76,17 +67,14 @@ auto URI::from_path(const std::filesystem::path &path) -> URI {
         result.append_path("/");
       }
     } else {
-      // Escape the segment
-      std::istringstream input{iterator->string()};
-      std::ostringstream output;
-      uri_escape(input, output, URIEscapeMode::Filesystem);
-      const auto escaped_segment = output.str();
+      // Store raw segment - escaping will happen during recompose()
+      const auto segment = iterator->string();
 
       if (result.path_.has_value()) {
-        result.append_path(escaped_segment);
+        result.append_path(segment);
       } else {
         // First segment: file:// URIs need leading slash
-        result.path_ = "/" + escaped_segment;
+        result.path_ = "/" + segment;
       }
     }
   }

diff --git a/src/core/uri/parse.cc b/src/core/uri/parse.cc
@@ -358,11 +358,10 @@ auto URI::parse(const std::string &input) -> void {
     parse_authority(input, position, this->userinfo_, this->host_, this->port_);
   }
 
-  const auto is_urn_or_tag = this->is_urn() || this->is_tag();
   auto path = parse_path(input, position);
 
   if (path.has_value()) {
-    uri_unescape_selective_inplace(path.value(), is_urn_or_tag);
+    uri_unescape_selective_inplace(path.value());
     this->path_ = std::move(path.value());
   } else if (has_authority || this->scheme_.has_value()) {
     if (input.ends_with(URI_SLASH) || input == "/") {

diff --git a/src/core/uri/recompose.cc b/src/core/uri/recompose.cc
@@ -9,12 +9,23 @@
 
 namespace sourcemeta::core {
 
+namespace {
+// Escapes `input` per `mode`; "%XX" in it is never treated as already encoded
+auto escape_component(std::string_view input, URIEscapeMode mode)
+    -> std::string {
+  std::istringstream in{std::string{input}};
+  std::ostringstream out;
+  uri_escape(in, out, mode, false); // false = preserve_percent_sequences off
+  return out.str();
+}
+
+} // namespace
+
 auto URI::recompose() const -> std::string {
   const auto uri = this->recompose_without_fragment();
 
   // Fragment
-  const auto result_fragment = this->fragment();
-  if (!result_fragment.has_value()) {
+  if (!this->fragment_.has_value()) {
     return uri.value_or("");
   }
 
@@ -25,9 +36,10 @@ auto URI::recompose() const -> std::string {
 
   result << '#';
 
-  // Escape fragment using stream-based escaping with lookahead
-  std::istringstream fragment_input{std::string{result_fragment.value()}};
-  uri_escape(fragment_input, result, URIEscapeMode::Fragment);
+  // Escape fragment using stream-based escaping
+  // Don't preserve percent sequences since internal storage is fully decoded
+  std::istringstream fragment_input{std::string{this->fragment_.value()}};
+  uri_escape(fragment_input, result, URIEscapeMode::Fragment, false);
 
   return result.str();
 }
@@ -54,7 +66,8 @@ auto URI::recompose_without_fragment() const -> std::optional<std::string> {
   }
 
   if (user_info.has_value()) {
-    result << user_info.value() << "@";
+    result << escape_component(user_info.value(), URIEscapeMode::Fragment)
+           << "@";
   }
 
   // Host
@@ -67,7 +80,8 @@ auto URI::recompose_without_fragment() const -> std::optional<std::string> {
       // https://tools.ietf.org/html/rfc2732#section-2
       result << '[' << result_host.value() << ']';
     } else {
-      result << result_host.value();
+      result << escape_component(result_host.value(),
+                                 URIEscapeMode::SkipSubDelims);
     }
   }
 
@@ -87,16 +101,17 @@ auto URI::recompose_without_fragment() const -> std::optional<std::string> {
     // "h" not "/h")
     if (result_scheme.has_value() && !has_authority &&
         path_value.starts_with("/") && !path_value.starts_with("//")) {
-      result << path_value.substr(1);
+      result << escape_component(path_value.substr(1), URIEscapeMode::Fragment);
     } else {
-      result << path_value;
+      result << escape_component(path_value, URIEscapeMode::Fragment);
     }
   }
 
   // Query
   const auto result_query{this->query()};
   if (result_query.has_value()) {
-    result << '?' << result_query.value();
+    result << '?'
+           << escape_component(result_query.value(), URIEscapeMode::Fragment);
   }
 
   if (result.tellp() == 0) {

diff --git a/src/core/uri/setters.cc b/src/core/uri/setters.cc
@@ -35,11 +35,8 @@ auto normalize_fragment(std::string_view input) -> std::string {
     return "";
   }
 
-  if (input.starts_with('#')) {
-    return std::string{input.substr(1)};
-  }
-
-  return std::string{input};
+  // Strip leading '#' and store raw value
+  return std::string{input.starts_with('#') ? input.substr(1) : input};
 }
 
 } // namespace

diff --git a/test/jsonpointer/jsonpointer_to_uri_test.cc b/test/jsonpointer/jsonpointer_to_uri_test.cc
@@ -294,6 +294,13 @@ TEST(JSONPointer_to_uri, with_absolute_base) {
   EXPECT_EQ(fragment.recompose(), "https://www.example.com#/foo/bar");
 }
 
+TEST(JSONPointer_to_uri, with_absolute_base_percentage) {
+  const sourcemeta::core::Pointer pointer{"foo%bar"};
+  const sourcemeta::core::URI base{"https://www.example.com"};
+  const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer, base)};
+  EXPECT_EQ(fragment.recompose(), "https://www.example.com#/foo%25bar");
+}
+
 TEST(JSONPointer_to_uri, with_relative_base) {
   const sourcemeta::core::Pointer pointer{"foo", "bar"};
   const sourcemeta::core::URI base{"../baz"};

diff --git a/test/uri/uri_fragment_test.cc b/test/uri/uri_fragment_test.cc
@@ -22,7 +22,7 @@ TEST(URI_fragment, https_with_empty_fragment) {
 TEST(URI_fragment, https_with_escaped_jsonpointer) {
   const sourcemeta::core::URI uri{"https://example.com/#/c%25d"};
   EXPECT_TRUE(uri.fragment().has_value());
-  EXPECT_EQ(uri.fragment().value(), "/c%25d");
+  EXPECT_EQ(uri.fragment().value(), "/c%d");
 }
 
 TEST(URI_fragment, invalid_fragment_with_pointer) {
@@ -164,7 +164,55 @@ TEST(URI_fragment, set_pointer_at) {
 TEST(URI_fragment, set_pointer_bracket) {
   sourcemeta::core::URI uri{"https://www.sourcemeta.com"};
   uri.fragment("/[foo/bar");
-  // Escaping should only happen during recomposing
   EXPECT_EQ(uri.fragment(), "/[foo/bar");
   EXPECT_EQ(uri.recompose(), "https://www.sourcemeta.com#/%5Bfoo/bar");
 }
+
+TEST(URI_fragment, set_pointer_percentage) {
+  sourcemeta::core::URI uri{"https://www.sourcemeta.com"};
+  uri.fragment("/foo%bar");
+  EXPECT_EQ(uri.fragment(), "/foo%bar");
+  EXPECT_EQ(uri.recompose(), "https://www.sourcemeta.com#/foo%25bar");
+}
+
+TEST(URI_fragment, set_percentage_at_end) {
+  sourcemeta::core::URI uri{"https://www.sourcemeta.com"};
+  uri.fragment("/foo%");
+  EXPECT_EQ(uri.fragment(), "/foo%");
+  EXPECT_EQ(uri.recompose(), "https://www.sourcemeta.com#/foo%25");
+}
+
+TEST(URI_fragment, set_percentage_with_one_hex) {
+  sourcemeta::core::URI uri{"https://www.sourcemeta.com"};
+  uri.fragment("/foo%2");
+  EXPECT_EQ(uri.fragment(), "/foo%2");
+  EXPECT_EQ(uri.recompose(), "https://www.sourcemeta.com#/foo%252");
+}
+
+TEST(URI_fragment, set_multiple_percentages) {
+  sourcemeta::core::URI uri{"https://www.sourcemeta.com"};
+  uri.fragment("/foo%%bar");
+  EXPECT_EQ(uri.fragment(), "/foo%%bar");
+  EXPECT_EQ(uri.recompose(), "https://www.sourcemeta.com#/foo%25%25bar");
+}
+// Setter input is raw text: "%2F" is not an escape, so its '%' becomes "%25"
+TEST(URI_fragment, set_percentage_followed_by_valid_hex_sequence) {
+  sourcemeta::core::URI uri{"https://www.sourcemeta.com"};
+  uri.fragment("/foo%2Fbar");
+  EXPECT_EQ(uri.fragment(), "/foo%2Fbar");
+  EXPECT_EQ(uri.recompose(), "https://www.sourcemeta.com#/foo%252Fbar");
+}
+
+TEST(URI_fragment, set_space_character) {
+  sourcemeta::core::URI uri{"https://www.sourcemeta.com"};
+  uri.fragment("/foo bar");
+  EXPECT_EQ(uri.fragment(), "/foo bar");
+  EXPECT_EQ(uri.recompose(), "https://www.sourcemeta.com#/foo%20bar");
+}
+
+TEST(URI_fragment, set_non_ascii_character) {
+  sourcemeta::core::URI uri{"https://www.sourcemeta.com"};
+  uri.fragment("/foo\xC3\xA9");
+  EXPECT_EQ(uri.fragment(), "/foo\xC3\xA9");
+  EXPECT_EQ(uri.recompose(), "https://www.sourcemeta.com#/foo%C3%A9");
+}
diff --git a/test/uri/uri_from_path_test.cc b/test/uri/uri_from_path_test.cc
@@ -11,7 +11,7 @@ TEST(URI_from_path, unix_absolute) {
 TEST(URI_from_path, unix_with_space_and_reserved) {
   const std::filesystem::path example{"/foo/My Folder/has#hash?value%"};
   const auto uri{sourcemeta::core::URI::from_path(example)};
-  EXPECT_EQ(uri.recompose(), "file:///foo/My%20Folder/has%23hash%3Fvalue%25");
+  EXPECT_EQ(uri.recompose(), "file:///foo/My%20Folder/has%23hash?value%25"); // NOTE(review): path '?' left unescaped; likely re-parses as a query - confirm
 }
 
 TEST(URI_from_path, unix_trailing_slash) {