Skip to content

Commit 9694db6

Browse files
committed
fix expected NaN values in both stats and page index
1 parent 07a4d77 commit 9694db6

8 files changed

Lines changed: 143 additions & 30 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,21 @@ public void updateStats(Binary value) {
7272
this.markAsNotEmpty();
7373
} else {
7474
if (isFloat16 && type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
75-
if (!Float16.isNaN(value.get2BytesLittleEndian())) {
76-
if (Float16.isNaN(min.get2BytesLittleEndian())
77-
|| comparator().compare(min, value) > 0) {
75+
boolean valueIsNaN = Float16.isNaN(value.get2BytesLittleEndian());
76+
boolean minIsNaN = Float16.isNaN(min.get2BytesLittleEndian());
77+
boolean maxIsNaN = Float16.isNaN(max.get2BytesLittleEndian());
78+
if (valueIsNaN) {
79+
if (minIsNaN && comparator().compare(min, value) > 0) {
7880
min = value.copy();
7981
}
80-
if (Float16.isNaN(max.get2BytesLittleEndian())
81-
|| comparator().compare(max, value) < 0) {
82+
if (maxIsNaN && comparator().compare(max, value) < 0) {
83+
max = value.copy();
84+
}
85+
} else {
86+
if (minIsNaN || comparator().compare(min, value) > 0) {
87+
min = value.copy();
88+
}
89+
if (maxIsNaN || comparator().compare(max, value) < 0) {
8290
max = value.copy();
8391
}
8492
}
@@ -156,15 +164,24 @@ public boolean isSmallerThanWithTruncation(long size, int truncationLength) {
156164
@Deprecated
157165
public void updateStats(Binary min_value, Binary max_value) {
158166
if (isFloat16 && type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
159-
if (!Float16.isNaN(min_value.get2BytesLittleEndian())) {
160-
if (Float16.isNaN(min.get2BytesLittleEndian()) || comparator().compare(min, min_value) > 0) {
167+
boolean minValueIsNaN = Float16.isNaN(min_value.get2BytesLittleEndian());
168+
boolean minIsNaN = Float16.isNaN(min.get2BytesLittleEndian());
169+
if (minValueIsNaN) {
170+
if (minIsNaN && comparator().compare(min, min_value) > 0) {
161171
min = min_value.copy();
162172
}
173+
} else if (minIsNaN || comparator().compare(min, min_value) > 0) {
174+
min = min_value.copy();
163175
}
164-
if (!Float16.isNaN(max_value.get2BytesLittleEndian())) {
165-
if (Float16.isNaN(max.get2BytesLittleEndian()) || comparator().compare(max, max_value) < 0) {
176+
177+
boolean maxValueIsNaN = Float16.isNaN(max_value.get2BytesLittleEndian());
178+
boolean maxIsNaN = Float16.isNaN(max.get2BytesLittleEndian());
179+
if (maxValueIsNaN) {
180+
if (maxIsNaN && comparator().compare(max, max_value) < 0) {
166181
max = max_value.copy();
167182
}
183+
} else if (maxIsNaN || comparator().compare(max, max_value) < 0) {
184+
max = max_value.copy();
168185
}
169186
return;
170187
}

parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,15 +105,20 @@ public boolean isSmallerThan(long size) {
105105

106106
public void updateStats(double min_value, double max_value) {
107107
if (type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
108-
if (!Double.isNaN(min_value)) {
109-
if (Double.isNaN(min) || comparator().compare(min, min_value) > 0) {
108+
if (Double.isNaN(min_value)) {
109+
if (Double.isNaN(min) && comparator().compare(min, min_value) > 0) {
110110
min = min_value;
111111
}
112+
} else if (Double.isNaN(min) || comparator().compare(min, min_value) > 0) {
113+
min = min_value;
112114
}
113-
if (!Double.isNaN(max_value)) {
114-
if (Double.isNaN(max) || comparator().compare(max, max_value) < 0) {
115+
116+
if (Double.isNaN(max_value)) {
117+
if (Double.isNaN(max) && comparator().compare(max, max_value) < 0) {
115118
max = max_value;
116119
}
120+
} else if (Double.isNaN(max) || comparator().compare(max, max_value) < 0) {
121+
max = max_value;
117122
}
118123
return;
119124
}

parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -106,15 +106,20 @@ public boolean isSmallerThan(long size) {
106106

107107
public void updateStats(float min_value, float max_value) {
108108
if (type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
109-
if (!Float.isNaN(min_value)) {
110-
if (Float.isNaN(min) || comparator().compare(min, min_value) > 0) {
109+
if (Float.isNaN(min_value)) {
110+
if (Float.isNaN(min) && comparator().compare(min, min_value) > 0) {
111111
min = min_value;
112112
}
113+
} else if (Float.isNaN(min) || comparator().compare(min, min_value) > 0) {
114+
min = min_value;
113115
}
114-
if (!Float.isNaN(max_value)) {
115-
if (Float.isNaN(max) || comparator().compare(max, max_value) < 0) {
116+
117+
if (Float.isNaN(max_value)) {
118+
if (Float.isNaN(max) && comparator().compare(max, max_value) < 0) {
116119
max = max_value;
117120
}
121+
} else if (Float.isNaN(max) || comparator().compare(max, max_value) < 0) {
122+
max = max_value;
118123
}
119124
return;
120125
}

parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ int compareValueToMax(int arrayIndex) {
108108
}
109109
}
110110

111-
private static final Binary FLOAT16_NAN = Binary.fromConstantByteArray(new byte[] {0x00, 0x7e});
112111
private final List<Binary> minValues = new ArrayList<>();
113112
private final List<Binary> maxValues = new ArrayList<>();
114113
private final BinaryTruncator truncator;
@@ -153,10 +152,7 @@ void addMinMax(Object min, Object max) {
153152
short sMax = bMax.get2BytesLittleEndian();
154153

155154
if (Float16.isNaN(sMin) || Float16.isNaN(sMax)) {
156-
if (isIeee754TotalOrder) {
157-
bMin = FLOAT16_NAN;
158-
bMax = FLOAT16_NAN;
159-
} else {
155+
if (!isIeee754TotalOrder) {
160156
invalid = true;
161157
}
162158
}

parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -131,10 +131,7 @@ void addMinMax(Object min, Object max) {
131131
double dMin = (double) min;
132132
double dMax = (double) max;
133133
if (Double.isNaN(dMin) || Double.isNaN(dMax)) {
134-
if (isIeee754TotalOrder) {
135-
dMin = Double.NaN;
136-
dMax = Double.NaN;
137-
} else {
134+
if (!isIeee754TotalOrder) {
138135
invalid = true;
139136
}
140137
}

parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -131,10 +131,7 @@ void addMinMax(Object min, Object max) {
131131
float fMin = (float) min;
132132
float fMax = (float) max;
133133
if (Float.isNaN(fMin) || Float.isNaN(fMax)) {
134-
if (isIeee754TotalOrder) {
135-
fMin = Float.NaN;
136-
fMax = Float.NaN;
137-
} else {
134+
if (!isIeee754TotalOrder) {
138135
invalid = true;
139136
}
140137
}

parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatisticsNanCount.java

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ private static Binary float16Binary(short h) {
6060
}
6161

6262
private static final Binary FLOAT16_NAN = float16Binary((short) 0x7e00);
63+
private static final Binary FLOAT16_NAN_SMALL = float16Binary((short) 0x7c01);
64+
private static final Binary FLOAT16_NAN_LARGE = float16Binary((short) 0x7fff);
6365
private static final Binary FLOAT16_ONE = float16Binary((short) 0x3C00);
6466
private static final Binary FLOAT16_TWO = float16Binary((short) 0x4000);
6567

@@ -239,6 +241,45 @@ public void testFloat16IEEE754NanOnlySetHasNonNullValue() {
239241
assertTrue(Float16.isNaN(stats.genericGetMax().get2BytesLittleEndian()));
240242
}
241243

244+
@Test
245+
public void testFloatIEEE754AllNaNTracksNaNRange() {
246+
FloatStatistics stats = (FloatStatistics) Statistics.createStats(FLOAT_IEEE754_TYPE);
247+
float minNaN = Float.intBitsToFloat(0x7fc00001);
248+
float maxNaN = Float.intBitsToFloat(0x7fffffff);
249+
stats.updateStats(maxNaN);
250+
stats.updateStats(minNaN);
251+
252+
assertEquals(2, stats.getNanCount());
253+
assertEquals(0x7fc00001, Float.floatToRawIntBits(stats.getMin()));
254+
assertEquals(0x7fffffff, Float.floatToRawIntBits(stats.getMax()));
255+
}
256+
257+
@Test
258+
public void testDoubleIEEE754AllNaNTracksNaNRange() {
259+
DoubleStatistics stats = (DoubleStatistics) Statistics.createStats(DOUBLE_IEEE754_TYPE);
260+
double minNaN = Double.longBitsToDouble(0x7ff0000000000001L);
261+
double maxNaN = Double.longBitsToDouble(0x7fffffffffffffffL);
262+
stats.updateStats(maxNaN);
263+
stats.updateStats(minNaN);
264+
265+
assertEquals(2, stats.getNanCount());
266+
assertEquals(0x7ff0000000000001L, Double.doubleToRawLongBits(stats.getMin()));
267+
assertEquals(0x7fffffffffffffffL, Double.doubleToRawLongBits(stats.getMax()));
268+
}
269+
270+
@Test
271+
public void testFloat16IEEE754AllNaNTracksNaNRange() {
272+
BinaryStatistics stats = (BinaryStatistics) Statistics.createStats(FLOAT16_IEEE754_TYPE);
273+
stats.updateStats(FLOAT16_NAN_LARGE);
274+
stats.updateStats(FLOAT16_NAN_SMALL);
275+
276+
assertEquals(2, stats.getNanCount());
277+
assertEquals(
278+
FLOAT16_NAN_SMALL.get2BytesLittleEndian(), stats.genericGetMin().get2BytesLittleEndian());
279+
assertEquals(
280+
FLOAT16_NAN_LARGE.get2BytesLittleEndian(), stats.genericGetMax().get2BytesLittleEndian());
281+
}
282+
242283
@Test
243284
public void testFloatIEEE754NanExcludedFromMax() {
244285
FloatStatistics stats = (FloatStatistics) Statistics.createStats(FLOAT_IEEE754_TYPE);

parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilderNaN.java

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import static org.junit.Assert.assertNotNull;
2323
import static org.junit.Assert.assertNull;
2424

25+
import java.nio.ByteOrder;
2526
import java.util.ArrayList;
2627
import java.util.HashSet;
2728
import java.util.List;
@@ -73,6 +74,8 @@ private static Binary float16Binary(short h) {
7374
}
7475

7576
private static final Binary FLOAT16_NAN = float16Binary((short) 0x7e00);
77+
private static final Binary FLOAT16_NAN_SMALL = float16Binary((short) 0x7c01);
78+
private static final Binary FLOAT16_NAN_LARGE = float16Binary((short) 0x7fff);
7679
private static final Binary FLOAT16_ONE = float16Binary((short) 0x3C00); // 1.0
7780
private static final Binary FLOAT16_TWO = float16Binary((short) 0x4000); // 2.0
7881
private static final Binary FLOAT16_THREE = float16Binary((short) 0x4200); // 3.0
@@ -172,6 +175,58 @@ public void testFloat16Ieee754BuildNanCounts() {
172175
assertEquals(List.of(1L, 2L, 0L), ci.getNanCounts());
173176
}
174177

178+
@Test
179+
public void testFloatIeee754AllNanPageKeepsNanRangeInColumnIndex() {
180+
float minNaN = Float.intBitsToFloat(0x7fc00001);
181+
float maxNaN = Float.intBitsToFloat(0x7fffffff);
182+
ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(FLOAT_IEEE754_TYPE, Integer.MAX_VALUE);
183+
builder.add(floatStats(FLOAT_IEEE754_TYPE, maxNaN, minNaN));
184+
185+
ColumnIndex ci = builder.build();
186+
assertNotNull(ci);
187+
assertEquals(1, ci.getMinValues().size());
188+
assertEquals(
189+
0x7fc00001,
190+
ci.getMinValues().get(0).order(ByteOrder.LITTLE_ENDIAN).getInt(0));
191+
assertEquals(
192+
0x7fffffff,
193+
ci.getMaxValues().get(0).order(ByteOrder.LITTLE_ENDIAN).getInt(0));
194+
}
195+
196+
@Test
197+
public void testDoubleIeee754AllNanPageKeepsNanRangeInColumnIndex() {
198+
double minNaN = Double.longBitsToDouble(0x7ff0000000000001L);
199+
double maxNaN = Double.longBitsToDouble(0x7fffffffffffffffL);
200+
ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(DOUBLE_IEEE754_TYPE, Integer.MAX_VALUE);
201+
builder.add(doubleStats(DOUBLE_IEEE754_TYPE, maxNaN, minNaN));
202+
203+
ColumnIndex ci = builder.build();
204+
assertNotNull(ci);
205+
assertEquals(1, ci.getMinValues().size());
206+
assertEquals(
207+
0x7ff0000000000001L,
208+
ci.getMinValues().get(0).order(ByteOrder.LITTLE_ENDIAN).getLong(0));
209+
assertEquals(
210+
0x7fffffffffffffffL,
211+
ci.getMaxValues().get(0).order(ByteOrder.LITTLE_ENDIAN).getLong(0));
212+
}
213+
214+
@Test
215+
public void testFloat16Ieee754AllNanPageKeepsNanRangeInColumnIndex() {
216+
ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(FLOAT16_IEEE754_TYPE, Integer.MAX_VALUE);
217+
builder.add(binaryStats(FLOAT16_IEEE754_TYPE, FLOAT16_NAN_LARGE, FLOAT16_NAN_SMALL));
218+
219+
ColumnIndex ci = builder.build();
220+
assertNotNull(ci);
221+
assertEquals(1, ci.getMinValues().size());
222+
assertEquals(
223+
FLOAT16_NAN_SMALL.get2BytesLittleEndian(),
224+
ci.getMinValues().get(0).order(ByteOrder.LITTLE_ENDIAN).getShort(0));
225+
assertEquals(
226+
FLOAT16_NAN_LARGE.get2BytesLittleEndian(),
227+
ci.getMaxValues().get(0).order(ByteOrder.LITTLE_ENDIAN).getShort(0));
228+
}
229+
175230
// Column index filtering for float
176231

177232
@Test

0 commit comments

Comments
 (0)