jiayuasu commented on code in PR #2776:
URL: https://github.com/apache/sedona/pull/2776#discussion_r2979672285
##########
spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java:
##########
@@ -63,7 +82,39 @@ private OsmNode parse(int idx, Osmformat.StringTable
stringTable) {
HashMap<String, String> tags = parseTags(stringTable);
- return new OsmNode(id, lat, lon, tags);
+ OsmNode node = new OsmNode(id, lat, lon, tags);
+
+ if (hasDenseInfo) {
+ Osmformat.DenseInfo denseInfo = nodes.getDenseinfo();
+
+ // version is NOT delta-encoded
+ node.setVersion(denseInfo.getVersion(idx));
+
+ // timestamp, changeset, uid, user_sid are delta-encoded
+ long timestamp = denseInfo.getTimestamp(idx) + firstTimestamp;
+ long changeset = denseInfo.getChangeset(idx) + firstChangeset;
+ int uid = denseInfo.getUid(idx) + firstUid;
+ int userSid = denseInfo.getUserSid(idx) + firstUserSid;
+
+ firstTimestamp = timestamp;
+ firstChangeset = changeset;
+ firstUid = uid;
+ firstUserSid = userSid;
+
+ node.setTimestamp(timestamp * dateGranularity);
+ node.setChangeset(changeset);
+ node.setUid(uid);
+ if (userSid > 0) {
+ node.setUser(stringTable.getS(userSid).toStringUtf8());
+ }
+
Review Comment:
Fixed. Each field access is now guarded with `get*Count() > idx` and delta
accumulators are only advanced when the field is present.
##########
spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/features/InfoResolver.java:
##########
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sedona.sql.datasources.osmpbf.features;
+
+import org.apache.sedona.sql.datasources.osmpbf.build.Osmformat;
+import org.apache.sedona.sql.datasources.osmpbf.model.OSMEntity;
+
+public class InfoResolver {
+
+ public static void populateInfo(
+ OSMEntity entity,
+ Osmformat.Info info,
+ Osmformat.StringTable stringTable,
+ int dateGranularity) {
+ if (info == null) {
+ return;
+ }
+ entity.setVersion(info.getVersion());
+ entity.setTimestamp((long) info.getTimestamp() * dateGranularity);
+ entity.setChangeset(info.getChangeset());
+ entity.setUid(info.getUid());
+ if (info.getUserSid() > 0) {
+ entity.setUser(stringTable.getS(info.getUserSid()).toStringUtf8());
+ }
+ if (info.hasVisible()) {
+ entity.setVisible(info.getVisible());
Review Comment:
Fixed. `visible` now defaults to `true` when absent, in both `InfoResolver`
and `DenseNodeExtractor`.
##########
spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java:
##########
@@ -63,7 +82,39 @@ private OsmNode parse(int idx, Osmformat.StringTable
stringTable) {
HashMap<String, String> tags = parseTags(stringTable);
- return new OsmNode(id, lat, lon, tags);
+ OsmNode node = new OsmNode(id, lat, lon, tags);
+
+ if (hasDenseInfo) {
+ Osmformat.DenseInfo denseInfo = nodes.getDenseinfo();
+
+ // version is NOT delta-encoded
+ node.setVersion(denseInfo.getVersion(idx));
+
+ // timestamp, changeset, uid, user_sid are delta-encoded
+ long timestamp = denseInfo.getTimestamp(idx) + firstTimestamp;
+ long changeset = denseInfo.getChangeset(idx) + firstChangeset;
+ int uid = denseInfo.getUid(idx) + firstUid;
+ int userSid = denseInfo.getUserSid(idx) + firstUserSid;
+
+ firstTimestamp = timestamp;
+ firstChangeset = changeset;
+ firstUid = uid;
+ firstUserSid = userSid;
+
+ node.setTimestamp(timestamp * dateGranularity);
+ node.setChangeset(changeset);
+ node.setUid(uid);
+ if (userSid > 0) {
+ node.setUser(stringTable.getS(userSid).toStringUtf8());
+ }
+
+ // visible is NOT delta-encoded, and may not be present
+ if (denseInfo.getVisibleCount() > idx) {
+ node.setVisible(denseInfo.getVisible(idx));
Review Comment:
Fixed in the same commit — added the `else` branch to default `visible` to
`true`.
##########
spark/common/src/test/scala/org/apache/sedona/sql/OsmReaderTest.scala:
##########
@@ -232,6 +232,90 @@ class OsmReaderTest extends TestBaseScala with Matchers {
relationsList should contain theSameElementsAs expectedRelationsList
}
+ it("should parse metadata fields (changeset, timestamp, uid, user,
version)") {
+ val osmData = sparkSession.read
+ .format("osmpbf")
+ .load(monacoPath)
+
+ // All entities should have version, timestamp, changeset populated
+ val totalCount = osmData.count()
+ val withMetadata = osmData
+ .filter("version is not null and timestamp is not null and changeset
is not null")
+ .count()
+
+ withMetadata shouldEqual totalCount
+
+ // Verify timestamp values are reasonable (after year 2000, before year
2100)
+ val timestamps = osmData
+ .selectExpr("min(timestamp)", "max(timestamp)")
+ .collect()
+ .head
+
+ val minTimestamp = timestamps.getTimestamp(0)
+ val maxTimestamp = timestamps.getTimestamp(1)
+
+ minTimestamp.after(java.sql.Timestamp.valueOf("2000-01-01 00:00:00"))
shouldBe true
+ maxTimestamp.before(java.sql.Timestamp.valueOf("2100-01-01 00:00:00"))
shouldBe true
+
+ // Verify version is positive
+ val minVersion = osmData
+ .selectExpr("min(version)")
+ .collect()
+ .head
+ .getInt(0)
+
+ minVersion should be >= 1
+
+ // Verify changeset is non-negative
+ val minChangeset = osmData
+ .selectExpr("min(changeset)")
+ .collect()
+ .head
+ .getLong(0)
+
+ minChangeset should be >= 0L
+
+ // Verify metadata works for each entity kind
+ for (kind <- Seq("node", "way", "relation")) {
+ val kindData = osmData.filter(s"kind == '$kind'")
+ val kindWithMeta = kindData
+ .filter("version is not null and timestamp is not null")
+ .count()
+
+ kindWithMeta shouldEqual kindData.count()
+ }
+ }
+
+ it("should include metadata fields in schema for dense nodes") {
+ val denseData = sparkSession.read
+ .format("osmpbf")
+ .load(densePath)
+
+ // Verify schema includes the new fields
+ val fieldNames = denseData.schema.fieldNames
+ fieldNames should contain("changeset")
+ fieldNames should contain("timestamp")
+ fieldNames should contain("uid")
+ fieldNames should contain("user")
+ fieldNames should contain("version")
+ fieldNames should contain("visible")
Review Comment:
Fixed. Added value assertions on a dense node row to verify changeset,
timestamp, uid, user, version, and visible are populated with valid data.
##########
spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/features/InfoResolver.java:
##########
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sedona.sql.datasources.osmpbf.features;
+
+import org.apache.sedona.sql.datasources.osmpbf.build.Osmformat;
+import org.apache.sedona.sql.datasources.osmpbf.model.OSMEntity;
+
+public class InfoResolver {
+
+ public static void populateInfo(
+ OSMEntity entity,
+ Osmformat.Info info,
+ Osmformat.StringTable stringTable,
+ int dateGranularity) {
+ if (info == null) {
+ return;
+ }
+ entity.setVersion(info.getVersion());
+ entity.setTimestamp((long) info.getTimestamp() * dateGranularity);
+ entity.setChangeset(info.getChangeset());
+ entity.setUid(info.getUid());
Review Comment:
Good point. Will address this in a follow-up — need to verify protobuf
`has*` behavior for the generated `Osmformat.Info` class and add test coverage
for partially populated Info blocks.
##########
spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java:
##########
@@ -26,23 +26,42 @@ public class DenseNodeExtractor implements Extractor {
long latOffset;
long lonOffset;
long granularity;
+ int dateGranularity;
long firstId;
long firstLat;
long firstLon;
Integer keyIndex;
+ // DenseInfo delta accumulators
+ boolean hasDenseInfo;
+ long firstTimestamp;
+ long firstChangeset;
+ int firstUid;
+ int firstUserSid;
+
Osmformat.DenseNodes nodes;
public DenseNodeExtractor(
- Osmformat.DenseNodes nodes, long latOffset, long lonOffset, long
granularity) {
+ Osmformat.DenseNodes nodes,
+ long latOffset,
+ long lonOffset,
+ long granularity,
+ int dateGranularity) {
this.firstId = 0;
this.firstLat = 0;
this.firstLon = 0;
this.latOffset = latOffset;
this.lonOffset = lonOffset;
this.granularity = granularity;
+ this.dateGranularity = dateGranularity;
this.nodes = nodes;
this.keyIndex = 0;
+
+ this.hasDenseInfo = nodes.hasDenseinfo() &&
nodes.getDenseinfo().getVersionCount() > 0;
Review Comment:
Fixed. The single `hasDenseInfo` boolean gate is still used to check if
DenseInfo exists at all, but each field is now independently guarded with
`get*Count() > idx`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]