Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 16 additions & 35 deletions src/core/uri/escaping.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,14 @@ enum class URIEscapeMode : std::uint8_t {
};

inline auto uri_escape(std::istream &input, std::ostream &output,
const URIEscapeMode mode) -> void {
const URIEscapeMode mode,
const bool preserve_percent_sequences = true) -> void {
char character = 0;
while (input.get(character)) {
// Check if this is an already percent-encoded sequence (%HEX HEX)
// Check if this is an already percent-encoded sequence (%HEXHEX)
// If so, preserve it as-is to avoid double-encoding
if (character == URI_PERCENT) {
// (only when preserve_percent_sequences is true)
if (preserve_percent_sequences && character == URI_PERCENT) {
const auto position = input.tellg();
char next_1 = 0;
char next_2 = 0;
Expand Down Expand Up @@ -131,14 +133,10 @@ inline auto uri_unescape(std::istream &input, std::ostream &output) -> void {
}
}

// Selective unescaping for URI normalization (in-place modification)
// Only unescapes unreserved characters, keeps reserved characters encoded
// but normalizes hex digits to uppercase
// unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
// Full unescaping for URI normalization (in-place modification)
// Decodes all percent-encoded sequences
// Modifies the input string in-place for zero-copy performance
inline auto uri_unescape_selective_inplace(std::string &str,
bool allow_colon_at = false)
-> void {
inline auto uri_unescape_selective_inplace(std::string &str) -> void {
std::string::size_type write_pos = 0;

for (std::string::size_type read_pos = 0; read_pos < str.size();) {
Expand All @@ -165,24 +163,10 @@ inline auto uri_unescape_selective_inplace(std::string &str,
const auto value = static_cast<unsigned char>(
(hex_to_int(first_digit) << 4) | hex_to_int(second_digit));

// Decode unreserved characters: ALPHA / DIGIT / "-" / "." / "_" / "~"
// For URNs/tags, also decode ":" and "@"
const auto is_unreserved = uri_is_unreserved(static_cast<char>(value));
const auto is_urn_allowed =
allow_colon_at && (value == URI_COLON || value == URI_AT);

if (is_unreserved || is_urn_allowed) {
str[write_pos++] = static_cast<char>(value);
read_pos += 3;
} else {
// Keep it percent-encoded (but normalize to uppercase hex)
str[write_pos++] = URI_PERCENT;
str[write_pos++] = static_cast<char>(
std::toupper(static_cast<unsigned char>(first_digit)));
str[write_pos++] = static_cast<char>(
std::toupper(static_cast<unsigned char>(second_digit)));
read_pos += 3;
}
// Decode all percent-encoded sequences
// Internal storage is always fully decoded
str[write_pos++] = static_cast<char>(value);
read_pos += 3;
} else {
str[write_pos++] = str[read_pos++];
}
Expand All @@ -191,14 +175,11 @@ inline auto uri_unescape_selective_inplace(std::string &str,
str.resize(write_pos);
}

// Selective unescaping for URI normalization (copy version for compatibility)
// Only unescapes unreserved characters, keeps reserved characters encoded
// but normalizes hex digits to uppercase
// unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
inline auto uri_unescape_selective(std::string_view input,
bool allow_colon_at = false) -> std::string {
// Full unescaping for URI normalization (copy version for compatibility)
// Decodes all percent-encoded sequences
// Copies `input` into an owned string, hands that copy to
// uri_unescape_selective_inplace and returns it; the argument is not modified
// NOTE(review): the name still says "selective" although the comments here
// describe decoding every percent-encoded sequence - confirm whether a rename
// is due
inline auto uri_unescape_selective(std::string_view input) -> std::string {
std::string result{input};
uri_unescape_selective_inplace(result, allow_colon_at);
uri_unescape_selective_inplace(result);
return result;
}

Expand Down
26 changes: 7 additions & 19 deletions src/core/uri/filesystem.cc
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
#include <sourcemeta/core/uri.h>

#include "escaping.h"

#include <algorithm> // std::ranges::replace
#include <filesystem> // std::filesystem
#include <iterator> // std::advance, std::next
#include <sstream> // std::istringstream, std::ostringstream
#include <string> // std::string

namespace sourcemeta::core {
Expand All @@ -28,11 +25,8 @@ auto URI::to_path() const -> std::filesystem::path {
std::ranges::replace(path, '/', '\\');
}

// Unescape percent-encoded characters
std::istringstream input{path};
std::ostringstream output;
uri_unescape(input, output);
return output.str();
// Path is already fully decoded, just return it
return path;
}

auto URI::from_path(const std::filesystem::path &path) -> URI {
Expand Down Expand Up @@ -60,10 +54,7 @@ auto URI::from_path(const std::filesystem::path &path) -> URI {

// For UNC paths, the first segment is the hostname
if (is_unc) {
std::istringstream input{iterator->string()};
std::ostringstream output;
uri_escape(input, output, URIEscapeMode::Filesystem);
result.host_ = output.str();
result.host_ = iterator->string();
std::advance(iterator, 1);
}

Expand All @@ -76,17 +67,14 @@ auto URI::from_path(const std::filesystem::path &path) -> URI {
result.append_path("/");
}
} else {
// Escape the segment
std::istringstream input{iterator->string()};
std::ostringstream output;
uri_escape(input, output, URIEscapeMode::Filesystem);
const auto escaped_segment = output.str();
// Store raw segment - escaping will happen during recompose()
const auto segment = iterator->string();

if (result.path_.has_value()) {
result.append_path(escaped_segment);
result.append_path(segment);
} else {
// First segment: file:// URIs need leading slash
result.path_ = "/" + escaped_segment;
result.path_ = "/" + segment;
}
}
}
Expand Down
3 changes: 1 addition & 2 deletions src/core/uri/parse.cc
Original file line number Diff line number Diff line change
Expand Up @@ -358,11 +358,10 @@ auto URI::parse(const std::string &input) -> void {
parse_authority(input, position, this->userinfo_, this->host_, this->port_);
}

const auto is_urn_or_tag = this->is_urn() || this->is_tag();
auto path = parse_path(input, position);

if (path.has_value()) {
uri_unescape_selective_inplace(path.value(), is_urn_or_tag);
uri_unescape_selective_inplace(path.value());
this->path_ = std::move(path.value());
} else if (has_authority || this->scheme_.has_value()) {
if (input.ends_with(URI_SLASH) || input == "/") {
Expand Down
35 changes: 25 additions & 10 deletions src/core/uri/recompose.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,23 @@

namespace sourcemeta::core {

namespace {

// Percent-encode a single URI component for serialisation.
// The component text is fed through the stream-based uri_escape under the
// given mode. The last argument is passed as false so that a '%' in the
// input is not treated as the start of an existing escape sequence.
auto escape_component(std::string_view input, URIEscapeMode mode)
    -> std::string {
  std::istringstream source{std::string{input}};
  std::ostringstream escaped;
  uri_escape(source, escaped, mode, false);
  return escaped.str();
}

} // namespace

auto URI::recompose() const -> std::string {
const auto uri = this->recompose_without_fragment();

// Fragment
const auto result_fragment = this->fragment();
if (!result_fragment.has_value()) {
if (!this->fragment_.has_value()) {
return uri.value_or("");
}

Expand All @@ -25,9 +36,10 @@ auto URI::recompose() const -> std::string {

result << '#';

// Escape fragment using stream-based escaping with lookahead
std::istringstream fragment_input{std::string{result_fragment.value()}};
uri_escape(fragment_input, result, URIEscapeMode::Fragment);
// Escape fragment using stream-based escaping
// Don't preserve percent sequences since internal storage is fully decoded
std::istringstream fragment_input{std::string{this->fragment_.value()}};
uri_escape(fragment_input, result, URIEscapeMode::Fragment, false);

return result.str();
}
Expand All @@ -54,7 +66,8 @@ auto URI::recompose_without_fragment() const -> std::optional<std::string> {
}

if (user_info.has_value()) {
result << user_info.value() << "@";
result << escape_component(user_info.value(), URIEscapeMode::Fragment)
<< "@";
}

// Host
Expand All @@ -67,7 +80,8 @@ auto URI::recompose_without_fragment() const -> std::optional<std::string> {
// https://tools.ietf.org/html/rfc2732#section-2
result << '[' << result_host.value() << ']';
} else {
result << result_host.value();
result << escape_component(result_host.value(),
URIEscapeMode::SkipSubDelims);
}
}

Expand All @@ -87,16 +101,17 @@ auto URI::recompose_without_fragment() const -> std::optional<std::string> {
// "h" not "/h")
if (result_scheme.has_value() && !has_authority &&
path_value.starts_with("/") && !path_value.starts_with("//")) {
result << path_value.substr(1);
result << escape_component(path_value.substr(1), URIEscapeMode::Fragment);
} else {
result << path_value;
result << escape_component(path_value, URIEscapeMode::Fragment);
}
}

// Query
const auto result_query{this->query()};
if (result_query.has_value()) {
result << '?' << result_query.value();
result << '?'
<< escape_component(result_query.value(), URIEscapeMode::Fragment);
}

if (result.tellp() == 0) {
Expand Down
7 changes: 2 additions & 5 deletions src/core/uri/setters.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,8 @@ auto normalize_fragment(std::string_view input) -> std::string {
return "";
}

if (input.starts_with('#')) {
return std::string{input.substr(1)};
}

return std::string{input};
// Strip leading '#' and store raw value
return std::string{input.starts_with('#') ? input.substr(1) : input};
}

} // namespace
Expand Down
7 changes: 7 additions & 0 deletions test/jsonpointer/jsonpointer_to_uri_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,13 @@ TEST(JSONPointer_to_uri, with_absolute_base) {
EXPECT_EQ(fragment.recompose(), "https://www.example.com#/foo/bar");
}

// A pointer token containing '%' must show up as %25 in the recomposed
// fragment when the pointer is converted to a URI against an absolute base
TEST(JSONPointer_to_uri, with_absolute_base_percentage) {
  const sourcemeta::core::URI base{"https://www.example.com"};
  const sourcemeta::core::Pointer pointer{"foo%bar"};
  const sourcemeta::core::URI resolved{sourcemeta::core::to_uri(pointer, base)};
  EXPECT_EQ(resolved.recompose(), "https://www.example.com#/foo%25bar");
}

TEST(JSONPointer_to_uri, with_relative_base) {
const sourcemeta::core::Pointer pointer{"foo", "bar"};
const sourcemeta::core::URI base{"../baz"};
Expand Down
52 changes: 50 additions & 2 deletions test/uri/uri_fragment_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ TEST(URI_fragment, https_with_empty_fragment) {
// Parses a URI whose fragment carries a percent-encoded '%' (%25) and checks
// what the fragment getter reports for it
TEST(URI_fragment, https_with_escaped_jsonpointer) {
const sourcemeta::core::URI uri{"https://example.com/#/c%25d"};
EXPECT_TRUE(uri.fragment().has_value());
EXPECT_EQ(uri.fragment().value(), "/c%25d");
// Decoded form: the %25 from the input reads back as a literal '%'
EXPECT_EQ(uri.fragment().value(), "/c%d");
}

TEST(URI_fragment, invalid_fragment_with_pointer) {
Expand Down Expand Up @@ -164,7 +164,55 @@ TEST(URI_fragment, set_pointer_at) {
// A '[' set through the fragment setter reads back unchanged from the getter
// and only turns into %5B in the recomposed URI
TEST(URI_fragment, set_pointer_bracket) {
  sourcemeta::core::URI subject{"https://www.sourcemeta.com"};
  subject.fragment("/[foo/bar");
  EXPECT_EQ(subject.fragment(), "/[foo/bar");
  EXPECT_EQ(subject.recompose(), "https://www.sourcemeta.com#/%5Bfoo/bar");
}

// A '%' in the middle of a fragment reads back unchanged from the getter and
// is emitted as %25 in the recomposed URI
TEST(URI_fragment, set_pointer_percentage) {
  sourcemeta::core::URI subject{"https://www.sourcemeta.com"};
  subject.fragment("/foo%bar");
  EXPECT_EQ(subject.fragment(), "/foo%bar");
  EXPECT_EQ(subject.recompose(), "https://www.sourcemeta.com#/foo%25bar");
}

// A trailing '%' with nothing after it reads back unchanged from the getter
// and is emitted as %25 in the recomposed URI
TEST(URI_fragment, set_percentage_at_end) {
  sourcemeta::core::URI subject{"https://www.sourcemeta.com"};
  subject.fragment("/foo%");
  EXPECT_EQ(subject.fragment(), "/foo%");
  EXPECT_EQ(subject.recompose(), "https://www.sourcemeta.com#/foo%25");
}

// A '%' followed by only one hex digit reads back unchanged from the getter;
// recomposing emits %25 followed by that digit
TEST(URI_fragment, set_percentage_with_one_hex) {
  sourcemeta::core::URI subject{"https://www.sourcemeta.com"};
  subject.fragment("/foo%2");
  EXPECT_EQ(subject.fragment(), "/foo%2");
  EXPECT_EQ(subject.recompose(), "https://www.sourcemeta.com#/foo%252");
}

// Two consecutive '%' characters read back unchanged from the getter and each
// one is emitted as its own %25 in the recomposed URI
TEST(URI_fragment, set_multiple_percentages) {
  sourcemeta::core::URI subject{"https://www.sourcemeta.com"};
  subject.fragment("/foo%%bar");
  EXPECT_EQ(subject.fragment(), "/foo%%bar");
  EXPECT_EQ(subject.recompose(), "https://www.sourcemeta.com#/foo%25%25bar");
}

// Text that looks like an escape ("%2F") is not decoded when passed to the
// setter: the getter returns it verbatim and recomposing escapes its '%',
// producing %252F
TEST(URI_fragment, set_percentage_followed_by_valid_hex_sequence) {
  sourcemeta::core::URI subject{"https://www.sourcemeta.com"};
  subject.fragment("/foo%2Fbar");
  EXPECT_EQ(subject.fragment(), "/foo%2Fbar");
  EXPECT_EQ(subject.recompose(), "https://www.sourcemeta.com#/foo%252Fbar");
}

// A space in the fragment reads back unchanged from the getter and is emitted
// as %20 in the recomposed URI
TEST(URI_fragment, set_space_character) {
  sourcemeta::core::URI subject{"https://www.sourcemeta.com"};
  subject.fragment("/foo bar");
  EXPECT_EQ(subject.fragment(), "/foo bar");
  EXPECT_EQ(subject.recompose(), "https://www.sourcemeta.com#/foo%20bar");
}

// The two non-ASCII bytes 0xC3 0xA9 read back unchanged from the getter and
// are emitted byte by byte as %C3%A9 in the recomposed URI
TEST(URI_fragment, set_non_ascii_character) {
  sourcemeta::core::URI subject{"https://www.sourcemeta.com"};
  subject.fragment("/foo\xC3\xA9");
  EXPECT_EQ(subject.fragment(), "/foo\xC3\xA9");
  EXPECT_EQ(subject.recompose(), "https://www.sourcemeta.com#/foo%C3%A9");
}
2 changes: 1 addition & 1 deletion test/uri/uri_from_path_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ TEST(URI_from_path, unix_absolute) {
// Builds a URI from a POSIX path whose segments contain a space, '#', '?' and
// a trailing '%', then checks the recomposed text
TEST(URI_from_path, unix_with_space_and_reserved) {
const std::filesystem::path example{"/foo/My Folder/has#hash?value%"};
const auto uri{sourcemeta::core::URI::from_path(example)};
EXPECT_EQ(uri.recompose(), "file:///foo/My%20Folder/has%23hash%3Fvalue%25");
// NOTE(review): this expectation keeps '?' literal inside the path. In URI
// syntax '?' introduces the query component, so re-parsing the recomposed
// string would presumably split "value%25" off as a query - confirm this is
// intended
EXPECT_EQ(uri.recompose(), "file:///foo/My%20Folder/has%23hash?value%25");
}

TEST(URI_from_path, unix_trailing_slash) {
Expand Down
Loading