jiayuasu commented on code in PR #2826: URL: https://github.com/apache/sedona/pull/2826#discussion_r3044756968
########## spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala: ########## @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.sedonainfo + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow +import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.catalyst.util.GenericArrayData +import org.apache.spark.sql.connector.read.PartitionReader +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.types.StructType +import org.apache.spark.unsafe.types.UTF8String + +import java.net.URI + +/** + * Reads raster file metadata by delegating to format-specific [[RasterFileMetadataExtractor]] + * implementations. Produces one [[InternalRow]] per file matching the readDataSchema. + */ +class SedonaInfoPartitionReader( + configuration: Configuration, + partitionedFiles: Array[PartitionedFile], + readDataSchema: StructType) + extends PartitionReader[InternalRow] { + + private var currentFileIndex = 0 + private var currentRow: InternalRow = _ + + override def next(): Boolean = { + if (currentFileIndex < partitionedFiles.length) { + currentRow = readFileMetadata(partitionedFiles(currentFileIndex)) + currentFileIndex += 1 + true + } else { + false + } + } + + override def get(): InternalRow = currentRow + + override def close(): Unit = {} + + private def readFileMetadata(partition: PartitionedFile): InternalRow = { + val path = new Path(new URI(partition.filePath.toString())) + val extractor = SedonaInfoPartitionReader.findExtractor(path) + val requiredFields = readDataSchema.fieldNames.toSet + val meta = extractor.extract(path, partition.fileSize, configuration, requiredFields) + SedonaInfoPartitionReader.toInternalRow(meta, readDataSchema) + } +} + +object SedonaInfoPartitionReader { + + /** Registered metadata extractors. Add new format extractors here. */ + private val extractors: Seq[RasterFileMetadataExtractor] = Seq(GeoTiffMetadataExtractor) + + def findExtractor(path: Path): RasterFileMetadataExtractor = { + extractors + .find(_.canHandle(path)) + .getOrElse( + throw new UnsupportedOperationException( + s"No metadata extractor found for file: ${path.getName}. " + + s"Supported formats: ${extractors.map(_.driver).mkString(", ")}")) + } + + def toInternalRow(meta: RasterFileMetadata, readDataSchema: StructType): InternalRow = { + val gt = meta.geoTransform + val geoTransformRow = new GenericInternalRow( + Array[Any](gt.upperLeftX, gt.upperLeftY, gt.scaleX, gt.scaleY, gt.skewX, gt.skewY)) + + val cc = meta.cornerCoordinates + val cornerCoordinatesRow = + new GenericInternalRow(Array[Any](cc.minX, cc.minY, cc.maxX, cc.maxY)) + + lazy val bandsArray: ArrayData = { + val bands = meta.bands.map { b => + new GenericInternalRow( + Array[Any]( + b.band, + if (b.dataType != null) UTF8String.fromString(b.dataType) else null, + if (b.colorInterpretation != null) UTF8String.fromString(b.colorInterpretation) + else null, + if (b.noDataValue != null) b.noDataValue.doubleValue() else null, + b.blockWidth, + b.blockHeight, + if (b.description != null) UTF8String.fromString(b.description) else null, + if (b.unit != null) UTF8String.fromString(b.unit) else null)) + }.toArray + new GenericArrayData(bands) + } + + lazy val overviewsArray: ArrayData = { + val overviews = meta.overviews.map { o => + new GenericInternalRow(Array[Any](o.level, o.width, o.height)) + }.toArray + new GenericArrayData(overviews) + } + + lazy val metadataMap: Any = { + if (meta.metadata.isEmpty) null + else { + org.apache.spark.sql.catalyst.util.ArrayBasedMapData( + meta.metadata.keys.map(UTF8String.fromString).toArray, + meta.metadata.values.map(UTF8String.fromString).toArray) Review Comment: Fixed. Now uses `toSeq` to iterate entries once and build key/value arrays from the same traversal. ########## docs/tutorial/files/sedonainfo-sedona-spark.md: ########## @@ -0,0 +1,194 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + --> + +# SedonaInfo - Raster File Metadata + +SedonaInfo is a Spark data source that reads raster file metadata without decoding pixel data, similar to [gdalinfo](https://gdal.org/en/stable/programs/gdalinfo.html). It returns one row per file with metadata including dimensions, coordinate system, band information, tiling, overviews, and compression. + +This is useful for: + +* Cataloging and inventorying large collections of raster files +* Detecting Cloud Optimized GeoTIFFs (COGs) by checking tiling and overview status +* Inspecting file properties before loading full raster data +* Building spatial indexes over raster file collections + +Currently supports **GeoTIFF** files. Additional formats can be added in the future. + +## Read GeoTIFF metadata Review Comment: NetCDF support was moved to a separate PR (#2829). This PR covers GeoTIFF only. ########## spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoDataSource.scala: ########## @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.sedonainfo + +import org.apache.spark.sql.connector.catalog.Table +import org.apache.spark.sql.connector.catalog.TableProvider +import org.apache.spark.sql.execution.datasources.FileFormat +import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 +import org.apache.spark.sql.sedona_sql.io.raster.RasterFileFormat +import org.apache.spark.sql.sources.DataSourceRegister +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +import scala.collection.JavaConverters._ + +/** + * A Spark SQL data source that reads raster file metadata without decoding pixel data, similar to + * gdalinfo. Currently supports GeoTIFF files. Additional formats can be added by implementing + * [[RasterFileMetadataExtractor]]. Review Comment: NetCDF support was moved to a separate PR (#2829). This PR covers GeoTIFF only. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
