Skip to content

Commit 83b67ce

Browse files
committed
Add VarHandle-based StringEncoder
1 parent 1e763b2 commit 83b67ce

File tree

19 files changed

+899
-495
lines changed

19 files changed

+899
-495
lines changed

exporters/common/build.gradle.kts

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,46 @@ plugins {
88
description = "OpenTelemetry Exporter Common"
99
otelJava.moduleName.set("io.opentelemetry.exporter.internal")
1010

11+
// Declare an extra "java9" source set whose sources live in src/main/java9.
java {
12+
sourceSets {
13+
create("java9") {
14+
java {
15+
srcDir("src/main/java9")
16+
}
17+
// Make java9 source set depend on main source set
18+
// since VarHandleStringEncoder implements StringEncoder from the main source set
19+
// (adds both the compiled main classes and main's own compile classpath)
compileClasspath += sourceSets.main.get().output + sourceSets.main.get().compileClasspath
20+
}
21+
}
22+
}
23+
24+
// Configure java9 compilation to see main source classes
// NOTE(review): the create("java9") block earlier in this script already appends
// sourceSets.main.get().output to this compileClasspath, so this second append looks
// redundant - confirm before removing.
25+
sourceSets.named("java9") {
26+
compileClasspath += sourceSets.main.get().output
27+
}
28+
29+
// Compile the java9 source set with the compiler's release option set to 9.
tasks.named<JavaCompile>("compileJava9Java") {
30+
options.release.set(9)
31+
}
32+
33+
// Package the jar as a multi-release jar: set the Multi-Release manifest attribute and copy the
// java9 source set output under META-INF/versions/9.
tasks.named<Jar>("jar") {
34+
manifest {
35+
attributes["Multi-Release"] = "true"
36+
}
37+
from(sourceSets.named("java9").get().output) {
38+
into("META-INF/versions/9")
39+
}
40+
}
41+
42+
// Configure test to include java9 classes when running on Java 9+
43+
// so that StringEncoderHolder.createUnsafeEncoder() can instantiate the Java 9 version
// NOTE(review): JavaVersion.current() is the version of the JVM running the build; if tests are
// launched on a different JVM than the build itself, this check may not match the test JVM -
// confirm.
44+
val javaVersion = JavaVersion.current()
45+
if (javaVersion >= JavaVersion.VERSION_1_9) {
46+
sourceSets.named("test") {
47+
runtimeClasspath += sourceSets.named("java9").get().output
48+
}
49+
}
50+
1151
val versions: Map<String, String> by project
1252
dependencies {
1353
api(project(":api:all"))
@@ -79,6 +119,15 @@ tasks {
79119
check {
80120
dependsOn(testing.suites)
81121
}
122+
123+
withType<Test> {
124+
// Allow VarHandle access to String internals
125+
// generally users won't do this and so won't get the VarHandle implementation
126+
// but the Java agent is able to automatically open these modules
127+
// (see ModuleOpener.java in the Java agent's repository)
128+
jvmArgs("--add-opens=java.base/java.lang=ALL-UNNAMED")
129+
jvmArgs("-XX:+IgnoreUnrecognizedVMOptions") // needed for Java 8
130+
}
82131
}
83132

84133
afterEvaluate {
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/*
2+
* Copyright The OpenTelemetry Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package io.opentelemetry.exporter.internal.marshal;
7+
8+
import java.io.IOException;
9+
10+
/**
11+
* This class contains shared logic for UTF-8 encoding operations while allowing subclasses to
12+
* implement different mechanisms for accessing String internal byte arrays (e.g., Unsafe vs
13+
* VarHandle).
14+
*
15+
* <p>This class is internal and is hence not for public use. Its APIs are unstable and can change
16+
* at any time.
17+
*/
18+
abstract class AbstractStringEncoder implements StringEncoder {
19+
20+
// Encoder that both public methods delegate to whenever the direct byte-array path is not taken.
private final FallbackStringEncoder fallback = new FallbackStringEncoder();
21+
22+
/**
 * Writes {@code string} to {@code output} as UTF-8.
 *
 * <p>When the string is reported as latin1 and its char count equals {@code utf8Length}, the
 * bytes returned by {@link #getStringBytes(String)} are written as-is; otherwise the work is
 * delegated to the fallback encoder.
 *
 * @param output stream to write the encoded bytes to
 * @param string the string to encode
 * @param utf8Length the UTF-8 encoded length of {@code string}, as computed by the caller
 * @throws IOException if writing to {@code output} fails
 */
@Override
23+
public final void writeUtf8(CodedOutputStream output, String string, int utf8Length)
24+
throws IOException {
25+
// if the length of the latin1 string and the utf8 output are the same then the string must be
26+
// composed of only 7bit characters and can be directly copied to the output
27+
if (string.length() == utf8Length && isLatin1(string)) {
28+
byte[] bytes = getStringBytes(string);
29+
output.write(bytes, 0, bytes.length);
30+
} else {
31+
fallback.writeUtf8(output, string, utf8Length);
32+
}
33+
}
34+
35+
/**
 * Returns the number of bytes needed to encode {@code string} as UTF-8.
 *
 * <p>For a latin1 string this is the char count plus one extra byte for every byte with the
 * most significant bit set; any other string is measured by the fallback encoder.
 *
 * @param string the string to measure
 * @return the UTF-8 encoded length in bytes
 */
@Override
36+
public final int getUtf8Size(String string) {
37+
if (isLatin1(string)) {
38+
byte[] bytes = getStringBytes(string);
39+
// latin1 bytes with negative value (most significant bit set) are encoded as 2 bytes in utf8
40+
return string.length() + countNegative(bytes);
41+
}
42+
43+
return fallback.getUtf8Size(string);
44+
}
45+
46+
/**
 * Returns the byte array holding the content of {@code string}. This class only calls it after
 * {@link #isLatin1(String)} returned true and treats every byte as one latin1 character.
 * NOTE(review): presumably this is the String's internal array rather than a copy - confirm in
 * the subclasses; this class never modifies it.
 */
protected abstract byte[] getStringBytes(String string);
47+
48+
/**
 * Returns whether {@code string} can be handled through the latin1 byte-array path, i.e. whether
 * {@link #getStringBytes(String)} yields one byte per character for it.
 */
protected abstract boolean isLatin1(String string);
49+
50+
/**
 * Reads a long from {@code bytes} at {@code offset}. {@code countNegative} calls this with
 * offsets that advance by 8 and always leave at least 8 bytes before the end of the array. It
 * only inspects the most significant bit of each of the 8 byte lanes and then sums all lanes, so
 * its result does not depend on the byte order used here.
 */
protected abstract long getLong(byte[] bytes, int offset);
51+
52+
// Inner loop can process at most 8 * 255 bytes without overflowing counter. To process more bytes
53+
// inner loop has to be run multiple times.
54+
private static final int MAX_INNER_LOOP_SIZE = 8 * 255;
55+
// mask that selects only the most significant bit in every byte of the long
56+
private static final long MOST_SIGNIFICANT_BIT_MASK = 0x8080808080808080L;
57+
58+
/** Returns the count of bytes with negative value. */
59+
private int countNegative(byte[] bytes) {
60+
int count = 0;
61+
int offset = 0;
62+
// We are processing one long (8 bytes) at a time. In the inner loop we are keeping counts in a
63+
// long where each byte in the long is a separate counter. Due to this the inner loop can
64+
// process a maximum of 8*255 bytes at a time without overflow.
65+
for (int i = 1; i <= bytes.length / MAX_INNER_LOOP_SIZE + 1; i++) {
66+
long tmp = 0; // each byte in this long is a separate counter
67+
// End of the i-th chunk, capped at the largest multiple of 8 that fits in the array so that
// getLong never reads past the end.
int limit = Math.min(i * MAX_INNER_LOOP_SIZE, bytes.length & ~7);
68+
for (; offset < limit; offset += 8) {
69+
long value = getLong(bytes, offset);
70+
// Mask the value keeping only the most significant bit in each byte and then shift this bit
71+
// to the position of the least significant bit in each byte. If the input byte was not
72+
// negative then after this transformation it will be zero, if it was negative then it will
73+
// be one.
74+
tmp += (value & MOST_SIGNIFICANT_BIT_MASK) >>> 7;
75+
}
76+
// sum up counts
// (skipped entirely when no negative byte was seen in this chunk)
77+
if (tmp != 0) {
78+
for (int j = 0; j < 8; j++) {
79+
count += (int) (tmp & 0xff);
80+
tmp = tmp >>> 8;
81+
}
82+
}
83+
}
84+
85+
// Handle remaining bytes. Previous loop processes 8 bytes a time, if the input size is not
86+
// divisible with 8 the remaining bytes are handled here.
87+
for (int i = offset; i < bytes.length; i++) {
88+
// same as if (bytes[i] < 0) count++;
// (the byte is sign-extended to int, so the unsigned shift leaves 1 for negative values and 0
// otherwise)
89+
count += bytes[i] >>> 31;
90+
}
91+
return count;
92+
}
93+
}
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
/*
2+
* Copyright The OpenTelemetry Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package io.opentelemetry.exporter.internal.marshal;
7+
8+
import java.io.IOException;
9+
10+
/**
11+
* Fallback StringEncoder implementation using standard Java string operations.
12+
*
13+
* <p>This implementation works on all Java versions and provides correct UTF-8 handling.
14+
*
15+
* <p>This class is internal and is hence not for public use. Its APIs are unstable and can change
16+
* at any time.
17+
*/
18+
final class FallbackStringEncoder implements StringEncoder {
19+
20+
FallbackStringEncoder() {}
21+
22+
/**
 * Returns the number of bytes needed to encode {@code string} as UTF-8, counting each unpaired
 * surrogate as a single byte.
 *
 * @throws IllegalArgumentException if the computed length does not fit in an int
 */
@Override
23+
public int getUtf8Size(String string) {
24+
return encodedUtf8Length(string);
25+
}
26+
27+
/**
 * Writes {@code string} to {@code output} as UTF-8, replacing each unpaired surrogate with
 * {@code ?}. The {@code utf8Length} argument is not used by this implementation.
 *
 * @throws IOException if writing to {@code output} fails
 */
@Override
28+
public void writeUtf8(CodedOutputStream output, String string, int utf8Length)
29+
throws IOException {
30+
encodeUtf8(output, string);
31+
}
32+
33+
// Computes the UTF-8 length of the whole string. The result starts at one byte per UTF-16 char
// and only the additional bytes are added on top of that.
// adapted from
34+
// https://github.com/protocolbuffers/protobuf/blob/b618f6750aed641a23d5f26fbbaf654668846d24/java/core/src/main/java/com/google/protobuf/Utf8.java#L217
35+
private static int encodedUtf8Length(String string) {
36+
// Warning to maintainers: this implementation is highly optimized.
37+
int utf16Length = string.length();
38+
int utf8Length = utf16Length;
39+
int i = 0;
40+
41+
// This loop optimizes for pure ASCII.
42+
while (i < utf16Length && string.charAt(i) < 0x80) {
43+
i++;
44+
}
45+
46+
// This loop optimizes for chars less than 0x800.
47+
for (; i < utf16Length; i++) {
48+
char c = string.charAt(i);
49+
if (c < 0x800) {
50+
// (0x7f - c) is negative exactly when c >= 0x80, so the shift yields 1 extra byte for
// two-byte chars and 0 for ASCII
utf8Length += ((0x7f - c) >>> 31); // branch free!
51+
} else {
52+
// the general method accounts for everything from index i to the end of the string
utf8Length += encodedUtf8LengthGeneral(string, i);
53+
break;
54+
}
55+
}
56+
57+
if (utf8Length < utf16Length) {
58+
// Necessary and sufficient condition for overflow because of maximum 3x expansion
59+
// adding 2^32 as a long reports the length as it was before the int wrapped around
throw new IllegalArgumentException(
60+
"UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
61+
}
62+
63+
return utf8Length;
64+
}
65+
66+
// Returns the number of UTF-8 bytes needed for the chars from index start to the end of the
// string IN ADDITION to the one byte per char that the caller has already counted.
// adapted from
67+
// https://github.com/protocolbuffers/protobuf/blob/b618f6750aed641a23d5f26fbbaf654668846d24/java/core/src/main/java/com/google/protobuf/Utf8.java#L247
68+
private static int encodedUtf8LengthGeneral(String string, int start) {
69+
int utf16Length = string.length();
70+
int utf8Length = 0;
71+
for (int i = start; i < utf16Length; i++) {
72+
char c = string.charAt(i);
73+
if (c < 0x800) {
74+
utf8Length += (0x7f - c) >>> 31; // branch free!
75+
} else {
76+
// three-byte encoding: 2 bytes on top of the 1 counted by the caller
utf8Length += 2;
77+
if (Character.isSurrogate(c)) {
78+
// Check that we have a well-formed surrogate pair.
79+
// codePointAt differs from c only for a high surrogate directly followed by a low surrogate
if (Character.codePointAt(string, i) != c) {
80+
// the pair takes 4 bytes: 2 counted by the caller (one per char) plus the 2 added above;
// skip the low surrogate
i++;
81+
} else {
82+
// invalid sequence
83+
// At this point we have accumulated 3 bytes of length (2 in this method and 1 in caller)
84+
// for current character, reduce the length to 1 byte as we are going to encode the
85+
// invalid character as ?
86+
utf8Length -= 2;
87+
}
88+
}
89+
}
90+
}
91+
92+
return utf8Length;
93+
}
94+
95+
// encode utf8 the same way as length is computed in encodedUtf8Length
96+
// adapted from
97+
// https://github.com/protocolbuffers/protobuf/blob/b618f6750aed641a23d5f26fbbaf654668846d24/java/core/src/main/java/com/google/protobuf/Utf8.java#L1016
98+
private static void encodeUtf8(CodedOutputStream output, String in) throws IOException {
99+
int utf16Length = in.length();
100+
int i = 0;
101+
// Designed to take advantage of
102+
// https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination
103+
// Fast path: copy the leading run of ASCII chars one byte each.
for (char c; i < utf16Length && (c = in.charAt(i)) < 0x80; i++) {
104+
output.write((byte) c);
105+
}
106+
if (i == utf16Length) {
107+
return;
108+
}
109+
110+
// General path for the rest of the string, starting at the first non-ASCII char.
for (char c; i < utf16Length; i++) {
111+
c = in.charAt(i);
112+
if (c < 0x80) {
113+
// 1 byte, 7 bits
114+
output.write((byte) c);
115+
} else if (c < 0x800) { // 11 bits, two UTF-8 bytes
116+
// (0xF << 6) is 0x3C0; the byte cast keeps the low 8 bits, giving the 110xxxxx lead byte
output.write((byte) ((0xF << 6) | (c >>> 6)));
117+
output.write((byte) (0x80 | (0x3F & c)));
118+
} else if (!Character.isSurrogate(c)) {
119+
// Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
120+
// (0xF << 5) is 0x1E0; the byte cast keeps the low 8 bits, giving the 1110xxxx lead byte
output.write((byte) ((0xF << 5) | (c >>> 12)));
121+
output.write((byte) (0x80 | (0x3F & (c >>> 6))));
122+
output.write((byte) (0x80 | (0x3F & c)));
123+
} else {
124+
// Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
125+
// four UTF-8 bytes
126+
int codePoint = Character.codePointAt(in, i);
127+
if (codePoint != c) {
128+
// well-formed pair: (0xF << 4) is 0xF0, the 11110xxx lead byte
output.write((byte) ((0xF << 4) | (codePoint >>> 18)));
129+
output.write((byte) (0x80 | (0x3F & (codePoint >>> 12))));
130+
output.write((byte) (0x80 | (0x3F & (codePoint >>> 6))));
131+
output.write((byte) (0x80 | (0x3F & codePoint)));
132+
// skip the low surrogate that was consumed as part of the pair
i++;
133+
} else {
134+
// invalid sequence
135+
// an unpaired surrogate is written as a single '?' byte, matching the 1 byte that
// encodedUtf8Length counts for it
output.write((byte) '?');
136+
}
137+
}
138+
}
139+
}
140+
}

exporters/common/src/main/java/io/opentelemetry/exporter/internal/marshal/MarshalerContext.java

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
*/
2828
public final class MarshalerContext {
2929
private final boolean marshalStringNoAllocation;
30-
private final boolean marshalStringUnsafe;
30+
private final StringEncoder stringEncoder;
3131

3232
private int[] sizes = new int[16];
3333
private int sizeReadIndex;
@@ -37,20 +37,25 @@ public final class MarshalerContext {
3737
private int dataWriteIndex;
3838

3939
public MarshalerContext() {
40-
this(/* marshalStringNoAllocation= */ true, /* marshalStringUnsafe= */ true);
40+
this(/* marshalStringNoAllocation= */ true);
4141
}
4242

43-
public MarshalerContext(boolean marshalStringNoAllocation, boolean marshalStringUnsafe) {
43+
public MarshalerContext(boolean marshalStringNoAllocation) {
4444
this.marshalStringNoAllocation = marshalStringNoAllocation;
45-
this.marshalStringUnsafe = marshalStringUnsafe;
45+
this.stringEncoder = StringEncoder.getInstance();
46+
}
47+
48+
public MarshalerContext(boolean marshalStringNoAllocation, StringEncoder stringEncoder) {
49+
this.marshalStringNoAllocation = marshalStringNoAllocation;
50+
this.stringEncoder = stringEncoder;
4651
}
4752

4853
public boolean marshalStringNoAllocation() {
4954
return marshalStringNoAllocation;
5055
}
5156

52-
public boolean marshalStringUnsafe() {
53-
return marshalStringUnsafe;
57+
public StringEncoder getStringEncoder() {
58+
return stringEncoder;
5459
}
5560

5661
public void addSize(int size) {

exporters/common/src/main/java/io/opentelemetry/exporter/internal/marshal/ProtoSerializer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ public void writeString(
160160
output.writeUInt32NoTag(field.getTag());
161161
output.writeUInt32NoTag(utf8Length);
162162

163-
StatelessMarshalerUtil.writeUtf8(output, string, utf8Length, context);
163+
context.getStringEncoder().writeUtf8(output, string, utf8Length);
164164
}
165165

166166
@Override

0 commit comments

Comments
 (0)