jiayuasu commented on code in PR #2826: URL: https://github.com/apache/sedona/pull/2826#discussion_r3044689547
########## spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala: ########## @@ -0,0 +1,263 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.sedonainfo + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.sedona.common.raster.RasterAccessors +import org.apache.sedona.common.raster.RasterBandAccessors +import org.apache.sedona.common.raster.inputstream.HadoopImageInputStream +import org.apache.sedona.common.utils.RasterUtils +import org.geotools.coverage.grid.GridCoverage2D +import org.geotools.gce.geotiff.GeoTiffReader +import org.geotools.referencing.crs.DefaultEngineeringCRS + +import scala.collection.mutable +import scala.util.Try + +/** + * Extracts metadata from GeoTIFF files without decoding pixel data. Uses GeoTools GeoTiffReader + * which lazily decodes the RenderedImage. + */ +object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { + + override def driver: String = "GTiff" + + override def canHandle(path: Path): Boolean = { + val name = path.getName.toLowerCase + name.endsWith(".tif") || name.endsWith(".tiff") + } + + override def extract( + path: Path, + fileSize: Long, + configuration: Configuration): RasterFileMetadata = { + val imageStream = new HadoopImageInputStream(path, configuration) + var reader: GeoTiffReader = null + var raster: GridCoverage2D = null + try { + reader = new GeoTiffReader( + imageStream, + new org.geotools.util.factory.Hints( + org.geotools.util.factory.Hints.FORCE_LONGITUDE_FIRST_AXIS_ORDER, + java.lang.Boolean.TRUE)) + raster = reader.read(null) + + val width = RasterAccessors.getWidth(raster) + val height = RasterAccessors.getHeight(raster) + val numBands = RasterAccessors.numBands(raster) + val srid = RasterAccessors.srid(raster) + + val crsStr = Try { + val crs = raster.getCoordinateReferenceSystem + if (crs == null || crs.isInstanceOf[DefaultEngineeringCRS]) null + else crs.toWKT + }.getOrElse(null) + + val affine = RasterUtils.getGDALAffineTransform(raster) + val env = raster.getEnvelope2D + + val image = raster.getRenderedImage + val tileWidth = image.getTileWidth + val tileHeight = image.getTileHeight + val isTiled = tileWidth < width || tileHeight < height + + val bands = extractBands(raster, numBands, tileWidth, tileHeight) + val overviews = extractOverviews(reader, width, height) + val metadata = extractMetadata(reader) + val compression = extractCompression(reader) + + RasterFileMetadata( + path = path.toString, + driver = driver, + fileSize = fileSize, + width = width, + height = height, + numBands = numBands, + srid = srid, + crs = crsStr, + upperLeftX = affine.getTranslateX, + upperLeftY = affine.getTranslateY, + scaleX = affine.getScaleX, + scaleY = affine.getScaleY, + skewX = affine.getShearX, + skewY = affine.getShearY, + envelopeMinX = env.getMinX, + envelopeMinY = env.getMinY, + envelopeMaxX = env.getMaxX, + envelopeMaxY = env.getMaxY, + bands = bands, + overviews = overviews, + metadata = metadata, + isTiled = isTiled, + compression = compression) + } finally { + if (raster != null) raster.dispose(true) + if (reader != null) reader.dispose() + imageStream.close() + } + } + + private def extractBands( + raster: GridCoverage2D, + numBands: Int, + tileWidth: Int, + tileHeight: Int): Seq[BandMetadata] = { + (1 to numBands).map { i => + val dataType = Try(RasterBandAccessors.getBandType(raster, i)).getOrElse(null) + val noDataValue = Try(RasterBandAccessors.getBandNoDataValue(raster, i)).getOrElse(null) + val description = Try { + val desc = raster.getSampleDimension(i - 1).getDescription + if (desc != null) desc.toString(java.util.Locale.ROOT) else null + }.getOrElse(null) + val unit = Try { + val units = raster.getSampleDimension(i - 1).getUnits + if (units != null) units.toString else null + }.getOrElse(null) + + BandMetadata( + band = i, + dataType = dataType, + colorInterpretation = description, + noDataValue = noDataValue, + blockWidth = tileWidth, + blockHeight = tileHeight, + description = description, + unit = unit) Review Comment: Fixed. `colorInterpretation` now derives from the TIFF PhotometricInterpretation tag (tag 262) via `ColorSpace.getName()` or falls back to photometric enum mapping (e.g., "RGB", "Gray", "Palette"). `description` retains the GeoTools sample dimension description. ########## spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/GeoTiffMetadataExtractor.scala: ########## @@ -0,0 +1,263 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.sedonainfo + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.sedona.common.raster.RasterAccessors +import org.apache.sedona.common.raster.RasterBandAccessors +import org.apache.sedona.common.raster.inputstream.HadoopImageInputStream +import org.apache.sedona.common.utils.RasterUtils +import org.geotools.coverage.grid.GridCoverage2D +import org.geotools.gce.geotiff.GeoTiffReader +import org.geotools.referencing.crs.DefaultEngineeringCRS + +import scala.collection.mutable +import scala.util.Try + +/** + * Extracts metadata from GeoTIFF files without decoding pixel data. Uses GeoTools GeoTiffReader + * which lazily decodes the RenderedImage. + */ +object GeoTiffMetadataExtractor extends RasterFileMetadataExtractor { + + override def driver: String = "GTiff" + + override def canHandle(path: Path): Boolean = { + val name = path.getName.toLowerCase + name.endsWith(".tif") || name.endsWith(".tiff") + } + + override def extract( + path: Path, + fileSize: Long, + configuration: Configuration): RasterFileMetadata = { + val imageStream = new HadoopImageInputStream(path, configuration) + var reader: GeoTiffReader = null + var raster: GridCoverage2D = null + try { + reader = new GeoTiffReader( + imageStream, + new org.geotools.util.factory.Hints( + org.geotools.util.factory.Hints.FORCE_LONGITUDE_FIRST_AXIS_ORDER, + java.lang.Boolean.TRUE)) + raster = reader.read(null) + + val width = RasterAccessors.getWidth(raster) + val height = RasterAccessors.getHeight(raster) + val numBands = RasterAccessors.numBands(raster) + val srid = RasterAccessors.srid(raster) + + val crsStr = Try { + val crs = raster.getCoordinateReferenceSystem + if (crs == null || crs.isInstanceOf[DefaultEngineeringCRS]) null + else crs.toWKT + }.getOrElse(null) + + val affine = RasterUtils.getGDALAffineTransform(raster) + val env = raster.getEnvelope2D + + val image = raster.getRenderedImage + val tileWidth = image.getTileWidth + val tileHeight = image.getTileHeight + val isTiled = tileWidth < width || tileHeight < height + + val bands = extractBands(raster, numBands, tileWidth, tileHeight) + val overviews = extractOverviews(reader, width, height) + val metadata = extractMetadata(reader) + val compression = extractCompression(reader) Review Comment: Fixed. `isTiled` now checks for the presence of TIFF TileWidth tag (tag 322) in IIO metadata, which distinguishes tiled vs strip-based storage regardless of how ImageIO exposes the data layout. ########## spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/sedonainfo/SedonaInfoPartitionReader.scala: ########## @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.sedonainfo + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow +import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.catalyst.util.GenericArrayData +import org.apache.spark.sql.connector.read.PartitionReader +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.types.StructType +import org.apache.spark.unsafe.types.UTF8String + +import java.net.URI + +/** + * Reads raster file metadata by delegating to format-specific [[RasterFileMetadataExtractor]] + * implementations. Produces one [[InternalRow]] per file matching the readDataSchema. + */ +class SedonaInfoPartitionReader( + configuration: Configuration, + partitionedFiles: Array[PartitionedFile], + readDataSchema: StructType) + extends PartitionReader[InternalRow] { + + private var currentFileIndex = 0 + private var currentRow: InternalRow = _ + + override def next(): Boolean = { + if (currentFileIndex < partitionedFiles.length) { + currentRow = readFileMetadata(partitionedFiles(currentFileIndex)) + currentFileIndex += 1 + true + } else { + false + } + } + + override def get(): InternalRow = currentRow + + override def close(): Unit = {} + + private def readFileMetadata(partition: PartitionedFile): InternalRow = { + val path = new Path(new URI(partition.filePath.toString())) + val extractor = SedonaInfoPartitionReader.findExtractor(path) + val meta = extractor.extract(path, partition.fileSize, configuration) + SedonaInfoPartitionReader.toInternalRow(meta, readDataSchema) + } Review Comment: Fixed in 6539d16. `extract()` now takes a `requiredFields: Set[String]` parameter. The extractor skips `bands`, `overviews`, `metadata`, `compression`, and `isTiled` extraction when those fields are not in the read schema. Core spatial fields (width, height, crs, srid, geoTransform, cornerCoordinates) are always extracted since they share the same GeoTools reader operations. ########## docs/tutorial/files/sedonainfo-sedona-spark.md: ########## @@ -0,0 +1,194 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + --> + +# SedonaInfo - Raster File Metadata + +SedonaInfo is a Spark data source that reads raster file metadata without decoding pixel data, similar to [gdalinfo](https://gdal.org/en/stable/programs/gdalinfo.html). It returns one row per file with metadata including dimensions, coordinate system, band information, tiling, overviews, and compression. + +This is useful for: + +* Cataloging and inventorying large collections of raster files +* Detecting Cloud Optimized GeoTIFFs (COGs) by checking tiling and overview status +* Inspecting file properties before loading full raster data +* Building spatial indexes over raster file collections + +Currently supports **GeoTIFF** files. Additional formats can be added in the future. + +## Read GeoTIFF metadata + +=== "Scala" + + ```scala + val df = sedona.read.format("sedonainfo").load("/path/to/rasters/") + df.show() + ``` + +=== "Java" + + ```java + Dataset<Row> df = sedona.read().format("sedonainfo").load("/path/to/rasters/"); + df.show(); + ``` + +=== "Python" + + ```python + df = sedona.read.format("sedonainfo").load("/path/to/rasters/") + df.show() + ``` + +You can also use glob patterns: + +```python +df = sedona.read.format("sedonainfo").load("/path/to/rasters/*.tif") +``` + +Or load a single file: + +```python +df = sedona.read.format("sedonainfo").load("/path/to/image.tiff") +``` + +## Output schema + +Each row represents one raster file with the following columns: + +| Column | Type | Description | +|--------|------|-------------| +| `path` | String | File path | +| `driver` | String | Format driver (e.g., "GTiff") | +| `fileSize` | Long | File size in bytes | +| `width` | Int | Image width in pixels | +| `height` | Int | Image height in pixels | +| `numBands` | Int | Number of bands | +| `srid` | Int | EPSG code (0 if unknown) | +| `crs` | String | Coordinate Reference System as WKT | +| `geoTransform` | Struct | Affine transform parameters | +| `cornerCoordinates` | Struct | Bounding box | +| `bands` | Array[Struct] | Per-band metadata | +| `overviews` | Array[Struct] | Overview (pyramid) levels | +| `metadata` | Map[String, String] | File-wide TIFF metadata tags | +| `isTiled` | Boolean | Whether the file uses internal tiling | +| `compression` | String | Compression type (e.g., "Deflate") | + +### geoTransform struct + +| Field | Type | Description | +|-------|------|-------------| +| `upperLeftX` | Double | Origin X in world coordinates | +| `upperLeftY` | Double | Origin Y in world coordinates | +| `scaleX` | Double | Pixel size in X direction | +| `scaleY` | Double | Pixel size in Y direction | +| `skewX` | Double | Rotation/shear in X | +| `skewY` | Double | Rotation/shear in Y | + +### cornerCoordinates struct + +| Field | Type | Description | +|-------|------|-------------| +| `minX` | Double | Minimum X (west) | +| `minY` | Double | Minimum Y (south) | +| `maxX` | Double | Maximum X (east) | +| `maxY` | Double | Maximum Y (north) | + +### bands array element + +| Field | Type | Description | +|-------|------|-------------| +| `band` | Int | Band number (1-indexed) | +| `dataType` | String | Data type (e.g., "REAL_32BITS") | +| `colorInterpretation` | String | Color interpretation (e.g., "Gray") | +| `noDataValue` | Double | NoData value (null if not set) | +| `blockWidth` | Int | Internal tile/block width | +| `blockHeight` | Int | Internal tile/block height | +| `description` | String | Band description | +| `unit` | String | Unit type (e.g., "meters") | + +### overviews array element + +| Field | Type | Description | +|-------|------|-------------| +| `level` | Int | Overview level (1, 2, 3, ...) | +| `width` | Int | Overview width in pixels | +| `height` | Int | Overview height in pixels | + +## Examples + +### Detect Cloud Optimized GeoTIFFs (COGs) + +A COG is a GeoTIFF that is internally tiled and has overview levels: + +```python +df = sedona.read.format("sedonainfo").load("/path/to/rasters/") +cogs = df.filter("isTiled AND size(overviews) > 0") +cogs.select("path", "compression", "overviews").show(truncate=False) +``` + +### Inspect band information + +```python +df = sedona.read.format("sedonainfo").load("/path/to/image.tif") +df.selectExpr("path", "explode(bands) as band").selectExpr( + "path", + "band.band", + "band.dataType", + "band.noDataValue", + "band.blockWidth", + "band.blockHeight", +).show() +``` + +### Filter by spatial extent + +```python +df = sedona.read.format("sedonainfo").load("/path/to/rasters/") +df.filter("cornerCoordinates.minX > -120 AND cornerCoordinates.maxX < -100").select( + "path", "width", "height", "srid" +).show() +``` + +### Get overview details + +```python +df = sedona.read.format("sedonainfo").load("/path/to/image.tif") +df.selectExpr("path", "explode(overviews) as ovr").selectExpr( + "path", "ovr.level", "ovr.width", "ovr.height" +).show() +``` + +### Column pruning for performance + +Select only the columns you need. SedonaInfo uses column pruning to skip extracting unused metadata: + +```python +df = ( + sedona.read.format("sedonainfo") + .load("/path/to/rasters/") + .select("path", "width", "height", "numBands") +) +df.show() +``` Review Comment: The docs do not claim column pruning skips extraction. The performance tip section says "Select only the columns you need" as a general Spark best practice. No rewording needed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
