This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 9e1efa4d73 Update pipes docs (#2759)
9e1efa4d73 is described below
commit 9e1efa4d731d9d91e6409ce72f8a170adc23bf4e
Author: Tim Allison <[email protected]>
AuthorDate: Thu Apr 9 21:07:48 2026 -0400
Update pipes docs (#2759)
---
docs/modules/ROOT/examples/pipes-fs-emitter.json | 1 +
docs/modules/ROOT/examples/pipes-fs-fetcher.json | 1 +
docs/modules/ROOT/examples/pipes-fs-pipeline.json | 1 +
docs/modules/ROOT/nav.adoc | 6 +
docs/modules/ROOT/pages/pipes/configuration.adoc | 152 +++++++++++++
docs/modules/ROOT/pages/pipes/emitters.adoc | 220 ++++++++++++++++++
docs/modules/ROOT/pages/pipes/fetchers.adoc | 245 +++++++++++++++++++++
docs/modules/ROOT/pages/pipes/getting-started.adoc | 135 ++++++++++++
docs/modules/ROOT/pages/pipes/index.adoc | 40 ++--
docs/modules/ROOT/pages/pipes/iterators.adoc | 212 ++++++++++++++++++
docs/modules/ROOT/pages/pipes/reporters.adoc | 93 ++++++++
.../ROOT/pages/using-tika/java-api/index.adoc | 51 ++++-
docs/supplemental-ui/partials/toolbar.hbs | 3 +
13 files changed, 1141 insertions(+), 19 deletions(-)
diff --git a/docs/modules/ROOT/examples/pipes-fs-emitter.json b/docs/modules/ROOT/examples/pipes-fs-emitter.json
new file mode 120000
index 0000000000..a9321db9eb
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-fs-emitter.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-emitter.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pipes-fs-fetcher.json b/docs/modules/ROOT/examples/pipes-fs-fetcher.json
new file mode 120000
index 0000000000..faef8e27a1
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-fs-fetcher.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-fetcher.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pipes-fs-pipeline.json b/docs/modules/ROOT/examples/pipes-fs-pipeline.json
new file mode 120000
index 0000000000..5a7538b141
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-fs-pipeline.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-pipeline.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index ea3e9726a9..819f9e2098 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -21,6 +21,12 @@
** xref:using-tika/cli/index.adoc[Command Line]
** xref:using-tika/grpc/index.adoc[gRPC]
* xref:pipes/index.adoc[Pipes]
+** xref:pipes/getting-started.adoc[Getting Started]
+** xref:pipes/fetchers.adoc[Fetchers]
+** xref:pipes/emitters.adoc[Emitters]
+** xref:pipes/iterators.adoc[Iterators]
+** xref:pipes/reporters.adoc[Reporters]
+** xref:pipes/configuration.adoc[Pipeline Configuration]
** xref:pipes/parse-modes.adoc[Parse Modes]
** xref:pipes/unpack-config.adoc[Extracting Embedded Bytes]
** xref:pipes/timeouts.adoc[Timeouts]
diff --git a/docs/modules/ROOT/pages/pipes/configuration.adoc b/docs/modules/ROOT/pages/pipes/configuration.adoc
new file mode 100644
index 0000000000..7204d39589
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/configuration.adoc
@@ -0,0 +1,152 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Pipes Configuration
+
+The `pipes` section of the JSON config controls the pipeline process itself:
+how many forked JVMs to run, timeouts, memory management, and parse behavior.
+
+[source,json]
+----
+{
+ "pipes": {
+ "numClients": 4,
+ "socketTimeoutMs": 60000,
+ "maxFilesProcessedPerProcess": 10000,
+ "parseMode": "RMETA",
+ "onParseException": "EMIT",
+ "forkedJvmArgs": ["-Xmx512m"]
+ }
+}
+----
+
+== Process Management
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`numClients`
+|`4`
+|Number of parallel forked JVM processes. Each processes one document at a time.
+
+|`forkedJvmArgs`
+|`[]`
+|JVM arguments for forked processes (e.g., `["-Xmx512m", "-Xms256m"]`).
+
+|`javaPath`
+|`java`
+|Path to the Java executable for forked processes.
+
+|`maxFilesProcessedPerProcess`
+|`10000`
+|Restart forked processes after this many files. Prevents slow-building memory leaks in parsing libraries.
+
+|`tempDirectory`
+|_system default_
+|Directory for temporary files. Consider a RAM-backed filesystem (e.g., `/dev/shm`) for better performance.
+|===
+
+== Timeouts
+
+See also xref:pipes/timeouts.adoc[Timeouts] for the full timeout model.
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`socketTimeoutMs`
+|`60000`
+|Maximum time (ms) to wait for data from a forked process. If no heartbeat or result is received within this window, the parse is considered hung.
+
+|`heartbeatIntervalMs`
+|`1000`
+|Interval (ms) between heartbeats sent from the forked process. Must be significantly less than `socketTimeoutMs`.
+
+|`startupTimeoutMillis`
+|`240000`
+|Maximum time (ms) to wait for a forked process to start up.
+
+|`shutdownClientAfterMillis`
+|`300000`
+|Shut down an idle forked process after this many milliseconds of inactivity.
+
+|`maxWaitForClientMillis`
+|`60000`
+|Maximum time (ms) to wait for an available forked process when all are busy.
+|===
+
+== Parse Behavior
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`parseMode`
+|`RMETA`
+|How embedded documents are handled: `RMETA` (recursive metadata list), `CONCATENATE`, `CONTENT_ONLY`, `UNPACK`. See xref:pipes/parse-modes.adoc[Parse Modes].
+
+|`onParseException`
+|`EMIT`
+|What to do when a parse fails: `EMIT` (emit error metadata) or `SKIP` (silently skip).
+
+|`stopOnlyOnFatal`
+|`false`
+|When `false`, stop the pipeline on configuration errors (missing fetcher/emitter). When `true`, only stop on fatal initialization failures. Use `true` for server mode, `false` for batch mode.
+|===
+
+== Async / Emit Batching
+
+These settings control how parsed results are batched before sending to emitters.
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`numEmitters`
+|`1`
+|Number of emitter threads.
+
+|`queueSize`
+|`10000`
+|Size of the fetch/emit tuple queue.
+
+|`emitWithinMillis`
+|`10000`
+|Flush the emit batch if nothing has been emitted within this many milliseconds, even if the batch is not full.
+
+|`emitMaxEstimatedBytes`
+|`100000`
+|Flush the emit batch when the estimated size reaches this many bytes.
+
+|`emitIntermediateResults`
+|`false`
+|Emit partial results as they become available (rather than waiting for the full parse to complete).
+|===
+
+== Shared Server Mode (Experimental)
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`useSharedServer`
+|`false`
+|When `true`, multiple clients share a single forked JVM instead of each having its own. Reduces memory overhead but sacrifices isolation -- one crash affects all in-flight requests. **Not recommended for production.**
+|===
+
+See xref:pipes/shared-server-mode.adoc[Shared Server Mode] for details.
diff --git a/docs/modules/ROOT/pages/pipes/emitters.adoc b/docs/modules/ROOT/pages/pipes/emitters.adoc
new file mode 100644
index 0000000000..3feeb8ebf3
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/emitters.adoc
@@ -0,0 +1,220 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Emitters
+
+Emitters write parsed results to a destination. Each emitter is identified by
+its component name and an `id` that is referenced by the pipes iterator.
+
+== File System Emitter (`file-system-emitter`)
+
+Writes parsed metadata as JSON files to a local or mounted filesystem.
+
+**Module:** `tika-pipes-file-system`
+
+[source,json,subs=none]
+----
+include::example$pipes-fs-emitter.json[]
+----
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`basePath`
+|_required_
+|Base output directory.
+
+|`fileExtension`
+|`json`
+|Extension for output files.
+
+|`onExists`
+|`EXCEPTION`
+|Behavior when output file exists: `SKIP`, `REPLACE`, `EXCEPTION`.
+
+|`prettyPrint`
+|`false`
+|Pretty-print JSON output.
+|===
+
+== Elasticsearch Emitter (`es-emitter`)
+
+Sends parsed documents to Elasticsearch via the `_bulk` API. Uses plain HTTP --
+no dependency on the ES Java client.
+
+**Module:** `tika-pipes-es`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`esUrl`
+|_required_
+|Full URL including index (e.g., `https://localhost:9200/my-index`).
+
+|`idField`
+|`_id`
+|Metadata field used as the document `_id`.
+
+|`apiKey`
+|_none_
+|Base64-encoded `id:api_key` for authentication.
+
+|`attachmentStrategy`
+|`SEPARATE_DOCUMENTS`
+|`SEPARATE_DOCUMENTS` or `PARENT_CHILD`.
+
+|`updateStrategy`
+|`OVERWRITE`
+|`OVERWRITE` (full replace) or `UPSERT` (field-level merge).
+
+|`embeddedFileFieldName`
+|`embedded`
+|Join-field name for `PARENT_CHILD` mode.
+|===
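+
+An illustrative fragment showing the fields above (the URL and API key are
+hypothetical; the surrounding wrapper and emitter `id` follow the plugin's
+config examples, which are not shown here):
+
+[source,json]
+----
+{
+  "esUrl": "https://localhost:9200/my-index",
+  "idField": "_id",
+  "attachmentStrategy": "SEPARATE_DOCUMENTS",
+  "updateStrategy": "OVERWRITE"
+}
+----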
+
+== OpenSearch Emitter (`opensearch-emitter`)
+
+Sends documents to OpenSearch. Configured identically to the ES emitter
+but uses `openSearchUrl` instead of `esUrl`.
+
+**Module:** `tika-pipes-opensearch`
+
+== S3 Emitter (`s3-emitter`)
+
+Writes parsed metadata as JSON objects to Amazon S3.
+
+**Module:** `tika-pipes-s3`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`bucket`
+|_required_
+|S3 bucket name.
+
+|`region`
+|_required_
+|AWS region.
+
+|`prefix`
+|_none_
+|S3 key prefix for output objects.
+
+|`credentialsProvider`
+|`profile`
+|Credentials type: `profile`, `static`, `instance`.
+
+|`fileExtension`
+|`json`
+|File extension for output keys.
+|===
+
+== GCS Emitter (`gcs-emitter`)
+
+Writes parsed metadata to Google Cloud Storage.
+
+**Module:** `tika-pipes-gcs`
+
+== Azure Blob Emitter (`az-blob-emitter`)
+
+Writes parsed metadata to Azure Blob Storage.
+
+**Module:** `tika-pipes-az-blob`
+
+== Solr Emitter (`solr-emitter`)
+
+Indexes parsed documents into Apache Solr.
+
+**Module:** `tika-pipes-solr`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`solrCollection`
+|_required_
+|Solr collection name.
+
+|`solrUrls`
+|_required_
+|List of Solr URLs.
+
+|`idField`
+|`id`
+|Field name for document ID.
+
+|`commitWithin`
+|`-1`
+|Milliseconds before auto-commit (-1 = server default).
+
+|`attachmentStrategy`
+|`SEPARATE_DOCUMENTS`
+|How to handle embedded documents.
+|===
+
+== JDBC Emitter (`jdbc-emitter`)
+
+Writes parsed metadata to a SQL database via JDBC.
+
+**Module:** `tika-pipes-jdbc`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`connection`
+|_required_
+|JDBC connection string.
+
+|`insert`
+|_required_
+|SQL INSERT statement with `?` placeholders.
+
+|`keys`
+|_required_
+|Ordered list of metadata keys to bind to placeholders.
+|===
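+
+An illustrative fragment (connection string, table, columns, and metadata keys
+are hypothetical; the `keys` are bound, in order, to the `?` placeholders):
+
+[source,json]
+----
+{
+  "connection": "jdbc:postgresql://localhost:5432/tika",
+  "insert": "INSERT INTO tika_output (path, title, content) VALUES (?, ?, ?)",
+  "keys": ["resourceName", "dc:title", "X-TIKA:content"]
+}
+----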
+
+== Kafka Emitter (`kafka-emitter`)
+
+Sends parsed metadata as messages to Apache Kafka.
+
+**Module:** `tika-pipes-kafka`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`topic`
+|_required_
+|Kafka topic name.
+
+|`bootstrapServers`
+|_required_
+|Kafka broker addresses.
+
+|`acks`
+|`all`
+|Acknowledgment requirement.
+
+|`lingerMs`
+|`0`
+|Batch wait time in milliseconds.
+|===
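+
+An illustrative fragment (topic and broker addresses are hypothetical):
+
+[source,json]
+----
+{
+  "topic": "tika-output",
+  "bootstrapServers": "broker1:9092,broker2:9092",
+  "acks": "all",
+  "lingerMs": 0
+}
+----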
diff --git a/docs/modules/ROOT/pages/pipes/fetchers.adoc b/docs/modules/ROOT/pages/pipes/fetchers.adoc
new file mode 100644
index 0000000000..eff355d0de
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/fetchers.adoc
@@ -0,0 +1,245 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Fetchers
+
+Fetchers retrieve document bytes from a source. Each fetcher is identified by
+its component name and an `id` that is referenced by the pipes iterator.
+
+== File System Fetcher (`file-system-fetcher`)
+
+Reads files from a local or mounted filesystem.
+
+**Module:** `tika-pipes-file-system`
+
+[source,json,subs=none]
+----
+include::example$pipes-fs-fetcher.json[]
+----
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`basePath`
+|_required_
+|Base directory. Fetch keys are resolved relative to this path.
+
+|`extractFileSystemMetadata`
+|`false`
+|Extract file created/modified timestamps and size into metadata.
+
+|`allowAbsolutePaths`
+|`false`
+|Allow absolute fetch keys when `basePath` is not set.
+|===
+
+== S3 Fetcher (`s3-fetcher`)
+
+Fetches objects from Amazon S3.
+
+**Module:** `tika-pipes-s3`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`bucket`
+|_required_
+|S3 bucket name.
+
+|`region`
+|_required_
+|AWS region (e.g., `us-east-1`).
+
+|`credentialsProvider`
+|`profile`
+|Credentials type: `profile`, `static`, `instance`.
+
+|`profile`
+|`default`
+|AWS profile name (when using `profile` credentials).
+
+|`accessKey` / `secretKey`
+|_none_
+|Static credentials (when using `static` credentials).
+
+|`prefix`
+|_none_
+|S3 key prefix.
+
+|`spoolToTemp`
+|`false`
+|Spool object to a temp file before parsing.
+
+|`extractUserMetadata`
+|`false`
+|Extract S3 user metadata.
+
+|`maxLength`
+|_unlimited_
+|Maximum object size to fetch.
+|===
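+
+An illustrative fragment using `profile` credentials (bucket, region, and
+profile name are hypothetical):
+
+[source,json]
+----
+{
+  "bucket": "my-documents",
+  "region": "us-east-1",
+  "credentialsProvider": "profile",
+  "profile": "default",
+  "spoolToTemp": true
+}
+----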
+
+== HTTP Fetcher (`http-fetcher`)
+
+Fetches documents from HTTP/HTTPS URLs.
+
+**Module:** `tika-pipes-http`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`userName`
+|_none_
+|Basic auth username.
+
+|`password`
+|_none_
+|Basic auth password.
+
+|`connectTimeoutMillis`
+|`30000`
+|Connection timeout.
+
+|`socketTimeoutMillis`
+|`120000`
+|Socket read timeout.
+
+|`maxConnections`
+|`200`
+|Maximum concurrent connections.
+
+|`userAgent`
+|_default_
+|HTTP User-Agent header.
+|===
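+
+An illustrative fragment with basic auth (credentials and limits are
+hypothetical):
+
+[source,json]
+----
+{
+  "userName": "crawler",
+  "password": "secret",
+  "connectTimeoutMillis": 30000,
+  "socketTimeoutMillis": 120000,
+  "maxConnections": 200
+}
+----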
+
+== GCS Fetcher (`gcs-fetcher`)
+
+Fetches objects from Google Cloud Storage.
+
+**Module:** `tika-pipes-gcs`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`projectId`
+|_required_
+|GCP project ID.
+
+|`bucket`
+|_required_
+|GCS bucket name.
+
+|`prefix`
+|_none_
+|Key prefix.
+
+|`spoolToTemp`
+|`false`
+|Spool to temp file before parsing.
+
+|`extractUserMetadata`
+|`false`
+|Extract GCS user metadata.
+|===
+
+== Azure Blob Fetcher (`az-blob-fetcher`)
+
+Fetches blobs from Azure Blob Storage.
+
+**Module:** `tika-pipes-az-blob`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`sasToken`
+|_required_
+|Shared Access Signature token.
+
+|`endpoint`
+|_required_
+|Azure storage endpoint URL.
+
+|`container`
+|_required_
+|Container name.
+
+|`prefix`
+|_none_
+|Blob prefix.
+
+|`extractUserMetadata`
+|`false`
+|Extract Azure user metadata.
+|===
+
+== Google Drive Fetcher (`google-drive-fetcher`)
+
+Fetches files from Google Drive via the Drive API.
+
+**Module:** `tika-pipes-google-drive`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`serviceAccountCredentialsPath`
+|_required_
+|Path to GCP service account JSON key file.
+
+|`impersonatedUser`
+|_none_
+|User email to impersonate (for domain-wide delegation).
+|===
+
+== Microsoft Graph Fetcher (`microsoft-graph-fetcher`)
+
+Fetches files from Microsoft 365 (OneDrive, SharePoint) via the Graph API.
+
+**Module:** `tika-pipes-microsoft-graph`
+
+== Atlassian JWT Fetcher (`atlassian-jwt-fetcher`)
+
+Fetches content from Atlassian products using JWT authentication.
+
+**Module:** `tika-pipes-atlassian-jwt`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`sharedSecret`
+|_required_
+|JWT shared secret.
+
+|`issuer`
+|_required_
+|JWT issuer / app key.
+
+|`connectTimeoutMillis`
+|`30000`
+|Connection timeout.
+
+|`socketTimeoutMillis`
+|`120000`
+|Socket read timeout.
+|===
diff --git a/docs/modules/ROOT/pages/pipes/getting-started.adoc b/docs/modules/ROOT/pages/pipes/getting-started.adoc
new file mode 100644
index 0000000000..6ee6c45148
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/getting-started.adoc
@@ -0,0 +1,135 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Getting Started with Tika Pipes
+
+This guide walks through a complete working example: reading files from
+a directory, parsing them, and writing JSON metadata to an output directory.
+
+== Quick Start with tika-app
+
+The simplest way to use Tika Pipes is through `tika-app`:
+
+[source,bash]
+----
+java -jar tika-app.jar -i /data/input -o /data/output
+----
+
+This recursively processes all files in `/data/input` and writes one `.json`
+file per document to `/data/output`. Each JSON file contains the extracted
+metadata and text content.
+
+=== Common Options
+
+[source,bash]
+----
+# Use 4 parallel forked processes
+java -jar tika-app.jar -i /data/input -o /data/output -n 4
+
+# Set memory limit per forked process
+java -jar tika-app.jar -i /data/input -o /data/output -n 4 -X 512m
+
+# Set parse timeout (milliseconds)
+java -jar tika-app.jar -i /data/input -o /data/output -T 120000
+
+# Extract plain text only (no HTML)
+java -jar tika-app.jar -i /data/input -o /data/output --handler t
+
+# Recursively unpack all embedded documents
+java -jar tika-app.jar -i /data/input -o /data/output -Z
+----
+
+Handler types: `t` (text), `h` (html), `x` (xml), `m` (markdown), `b` (body), `i` (ignore/metadata only).
+
+== JSON Configuration
+
+For more control, create a JSON config file. Here is a complete filesystem-to-filesystem
+pipeline:
+
+[source,json,subs=none]
+----
+include::example$pipes-fs-pipeline.json[]
+----
+icon:github[] https://github.com/apache/tika/blob/main/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-pipeline.json[View source on GitHub]
+
+Run it with:
+
+[source,bash]
+----
+java -jar tika-app.jar --config tika-config.json -i /data/input -o /data/output
+----
+
+NOTE: The `-i` and `-o` flags override the `basePath` values in the config when used
+with tika-app. The config file is useful for setting other options like `extractFileSystemMetadata`,
+`onExists`, and `prettyPrint`.
+
+== How It Works
+
+A Tika Pipes pipeline has four components:
+
+1. **Pipes Iterator** -- enumerates the documents to process (e.g., walk a directory, list an S3 bucket, query a database)
+2. **Fetcher** -- retrieves each document's bytes (e.g., read from filesystem, download from S3)
+3. **Parser** -- extracts text and metadata (runs in a forked JVM for robustness)
+4. **Emitter** -- writes the results (e.g., JSON to filesystem, index to Elasticsearch)
+
+[source]
+----
+Iterator --> Fetcher --> [forked JVM: Parse] --> Emitter
+----
+
+Each parse runs in an isolated forked process with configurable timeouts and memory
+limits. If a parse hangs or crashes, only that forked process is affected -- the
+pipeline continues with the remaining documents.
+
+== Pipeline Configuration Options
+
+The `pipes` section controls the pipeline behavior:
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`numClients`
+|`4`
+|Number of parallel forked parse processes
+
+|`parseMode`
+|`RMETA`
+|Output mode: `RMETA` (full recursive metadata), `CONCATENATE`, `CONTENT_ONLY`, `UNPACK`
+
+|`socketTimeoutMs`
+|`60000`
+|Maximum time (ms) for a single parse operation
+
+|`maxFilesProcessedPerProcess`
+|`10000`
+|Restart forked processes after this many files (prevents memory leaks)
+
+|`onParseException`
+|`EMIT`
+|What to do on parse failure: `EMIT` (emit error metadata), `SKIP`
+|===
+
+See xref:pipes/parse-modes.adoc[Parse Modes] and xref:pipes/timeouts.adoc[Timeouts]
+for details.
+
+== Next Steps
+
+* xref:pipes/fetchers.adoc[Fetchers] -- all available document sources
+* xref:pipes/emitters.adoc[Emitters] -- all available output destinations
+* xref:pipes/iterators.adoc[Iterators] -- all available document enumeration methods
+* xref:pipes/reporters.adoc[Reporters] -- track processing status
diff --git a/docs/modules/ROOT/pages/pipes/index.adoc b/docs/modules/ROOT/pages/pipes/index.adoc
index a6fea020ae..796f9d7f1f 100644
--- a/docs/modules/ROOT/pages/pipes/index.adoc
+++ b/docs/modules/ROOT/pages/pipes/index.adoc
@@ -21,24 +21,36 @@ This section covers Tika Pipes for scalable, fault-tolerant document processing.
== Overview
-Tika Pipes provides a framework for processing large volumes of documents with:
+Tika Pipes provides a framework for fault-tolerant, scalable document processing.
+Each document is parsed in a forked JVM with configurable timeouts and memory limits,
+so a single malformed file cannot crash or hang your application.
-* **Fetchers** - Retrieve documents from various sources (filesystem, S3, HTTP, etc.)
-* **Emitters** - Send parsed results to various destinations (filesystem, OpenSearch, ES-compatible, Solr, etc.)
-* **Pipelines** - Configure processing workflows
+While Tika Pipes has a programmatic Java API, it is best used through:
-== Topics
+* xref:using-tika/cli/index.adoc[tika-app] — batch processing from the command line
+* xref:using-tika/server/index.adoc[tika-server] — REST API with pipes-based robustness built in
+* xref:using-tika/grpc/index.adoc[tika-grpc] — gRPC API with pipes-based robustness built in
+
+See xref:advanced/robustness.adoc[Robustness] for details on how Tika Pipes protects
+against problematic files.
+
+=== Key Components
-* xref:pipes/parse-modes.adoc[Parse Modes] - Control how documents are parsed and emitted (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`, `UNPACK`)
-* xref:pipes/unpack-config.adoc[Extracting Embedded Bytes] - Extract raw bytes from embedded documents using `ParseMode.UNPACK`
-* xref:pipes/timeouts.adoc[Timeouts] - Two-tier timeout system for handling long-running and hung parsers
+* **Fetchers** — retrieve documents from various sources (filesystem, S3, HTTP, etc.)
+* **Emitters** — send parsed results to various destinations (filesystem, OpenSearch, ES-compatible, Solr, etc.)
+* **Pipelines** — configure processing workflows
+
+== Topics
-// Add links to specific topics as they are created
-// * link:getting-started.html[Getting Started]
-// * link:fetchers.html[Fetchers]
-// * link:emitters.html[Emitters]
-// * link:configuration.html[Configuration]
-// * link:async.html[Async Processing]
+* xref:pipes/getting-started.adoc[Getting Started] -- complete working example with tika-app
+* xref:pipes/fetchers.adoc[Fetchers] -- all available document sources (filesystem, S3, HTTP, GCS, Azure, etc.)
+* xref:pipes/emitters.adoc[Emitters] -- all available output destinations (filesystem, ES, OpenSearch, Solr, S3, Kafka, etc.)
+* xref:pipes/iterators.adoc[Iterators] -- document enumeration (directory walk, S3 listing, CSV, JDBC, Kafka, etc.)
+* xref:pipes/reporters.adoc[Reporters] -- track per-document processing status
+* xref:pipes/configuration.adoc[Pipeline Configuration] -- numClients, timeouts, JVM args, parse modes, emit batching
+* xref:pipes/parse-modes.adoc[Parse Modes] -- control how documents are parsed and emitted (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`, `UNPACK`)
+* xref:pipes/unpack-config.adoc[Extracting Embedded Bytes] -- extract raw bytes from embedded documents
+* xref:pipes/timeouts.adoc[Timeouts] -- two-tier timeout system for handling long-running and hung parsers
== Emitters
diff --git a/docs/modules/ROOT/pages/pipes/iterators.adoc b/docs/modules/ROOT/pages/pipes/iterators.adoc
new file mode 100644
index 0000000000..dc433bb492
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/iterators.adoc
@@ -0,0 +1,212 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Pipes Iterators
+
+Pipes iterators enumerate the documents to be processed. Each iterator
+produces fetch/emit tuples that the pipeline consumes.
+
+All iterators share a `baseConfig` block that specifies which fetcher and emitter
+to use:
+
+[source,json]
+----
+"baseConfig": {
+ "fetcherId": "my-fetcher-id",
+ "emitterId": "my-emitter-id"
+}
+----
+
+== File System Iterator (`file-system-pipes-iterator`)
+
+Recursively walks a directory tree.
+
+**Module:** `tika-pipes-file-system`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`basePath`
+|_required_
+|Directory to walk.
+
+|`countTotal`
+|`false`
+|Count total files before processing (enables progress reporting).
+
+|`baseConfig`
+|_required_
+|Fetcher/emitter IDs.
+|===
+
+== S3 Iterator (`s3-pipes-iterator`)
+
+Lists objects in an S3 bucket.
+
+**Module:** `tika-pipes-s3`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`bucket`
+|_required_
+|S3 bucket name.
+
+|`region`
+|_required_
+|AWS region.
+
+|`prefix`
+|_none_
+|Key prefix to filter objects.
+
+|`credentialsProvider`
+|`profile`
+|Credentials type.
+
+|`baseConfig`
+|_required_
+|Fetcher/emitter IDs.
+|===
+
+== GCS Iterator (`gcs-pipes-iterator`)
+
+Lists objects in a Google Cloud Storage bucket.
+
+**Module:** `tika-pipes-gcs`
+
+== Azure Blob Iterator (`az-blob-pipes-iterator`)
+
+Lists blobs in an Azure Blob Storage container.
+
+**Module:** `tika-pipes-az-blob`
+
+== CSV Iterator (`csv-pipes-iterator`)
+
+Reads rows from a CSV file to generate fetch/emit tuples.
+
+**Module:** `tika-pipes-csv`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`csvPath`
+|_required_
+|Path to the CSV file.
+
+|`fetchKeyColumn`
+|_required_
+|Column name containing the fetch key (file path, S3 key, etc.).
+
+|`emitKeyColumn`
+|_none_
+|Column name for the emit key. If omitted, uses the fetch key.
+
+|`baseConfig`
+|_required_
+|Fetcher/emitter IDs.
+|===
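+
+Given a CSV with columns `docId,srcPath,outPath` (names hypothetical), an
+illustrative fragment would map them as:
+
+[source,json]
+----
+{
+  "csvPath": "/data/files.csv",
+  "fetchKeyColumn": "srcPath",
+  "emitKeyColumn": "outPath",
+  "baseConfig": {
+    "fetcherId": "my-fetcher-id",
+    "emitterId": "my-emitter-id"
+  }
+}
+----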
+
+== JDBC Iterator (`jdbc-pipes-iterator`)
+
+Executes a SQL query and uses each row as a fetch/emit tuple.
+
+**Module:** `tika-pipes-jdbc`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`connection`
+|_required_
+|JDBC connection string.
+
+|`select`
+|_required_
+|SQL SELECT query.
+
+|`fetchKeyColumn`
+|_required_
+|Column containing the fetch key.
+
+|`idColumn`
+|_none_
+|Column containing the document ID.
+
+|`baseConfig`
+|_required_
+|Fetcher/emitter IDs.
+|===
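+
+An illustrative fragment (connection string, query, and column names are
+hypothetical):
+
+[source,json]
+----
+{
+  "connection": "jdbc:postgresql://localhost:5432/docs",
+  "select": "SELECT doc_id, file_path FROM documents",
+  "fetchKeyColumn": "file_path",
+  "idColumn": "doc_id",
+  "baseConfig": {
+    "fetcherId": "my-fetcher-id",
+    "emitterId": "my-emitter-id"
+  }
+}
+----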
+
+== Solr Iterator (`solr-pipes-iterator`)
+
+Queries a Solr collection and uses each document as a fetch/emit tuple.
+
+**Module:** `tika-pipes-solr`
+
+== JSON Iterator (`json-pipes-iterator`)
+
+Reads an array of objects from a JSON file.
+
+**Module:** `tika-pipes-json`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`jsonPath`
+|_required_
+|Path to the JSON file.
+
+|`baseConfig`
+|_required_
+|Fetcher/emitter IDs.
+|===
+
+== Kafka Iterator (`kafka-pipes-iterator`)
+
+Consumes messages from a Kafka topic as fetch/emit tuples.
+
+**Module:** `tika-pipes-kafka`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`topic`
+|_required_
+|Kafka topic.
+
+|`bootstrapServers`
+|_required_
+|Kafka broker addresses.
+
+|`groupId`
+|_required_
+|Consumer group ID.
+
+|`autoOffsetReset`
+|`earliest`
+|Where to start reading: `earliest` or `latest`.
+
+|`baseConfig`
+|_required_
+|Fetcher/emitter IDs.
+|===
diff --git a/docs/modules/ROOT/pages/pipes/reporters.adoc b/docs/modules/ROOT/pages/pipes/reporters.adoc
new file mode 100644
index 0000000000..3994ede95e
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/reporters.adoc
@@ -0,0 +1,93 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Pipes Reporters
+
+Reporters track the processing status of each document in the pipeline.
+They record whether a parse succeeded, failed, or timed out, along with
+timing information.
+
+== File System Reporter (`file-system-reporter`)
+
+Writes a JSON status file that is updated periodically.
+
+**Module:** `tika-pipes-file-system`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`statusFile`
+|_required_
+|Path to the JSON status file.
+
+|`reportUpdateMs`
+|`1000`
+|How often to update the status file (milliseconds).
+|===
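+
+An illustrative fragment (the status-file path is hypothetical):
+
+[source,json]
+----
+{
+  "statusFile": "/data/status/pipes-status.json",
+  "reportUpdateMs": 1000
+}
+----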
+
+== JDBC Reporter (`jdbc-reporter`)
+
+Writes per-document status to a SQL database table.
+
+**Module:** `tika-pipes-jdbc`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`connectionString`
+|_required_
+|JDBC connection string.
+
+|`tableName`
+|_required_
+|Table name for status records.
+
+|`createTable`
+|`false`
+|Auto-create the table if it does not exist.
+|===
+
+== Elasticsearch Reporter (`es-pipes-reporter`)
+
+Writes per-document parse status back into the Elasticsearch index via upsert.
+
+**Module:** `tika-pipes-es`
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`esUrl`
+|_required_
+|Elasticsearch endpoint (including index).
+
+|`keyPrefix`
+|`tika_`
+|Prefix for status fields (e.g., `tika_parse_status`).
+
+|`includeRouting`
+|`false`
+|Include routing in upsert requests.
+|===
+
+== OpenSearch Reporter (`opensearch-pipes-reporter`)
+
+Same as the ES reporter but for OpenSearch. Uses `openSearchUrl` instead of `esUrl`.
+
+**Module:** `tika-pipes-opensearch`
diff --git a/docs/modules/ROOT/pages/using-tika/java-api/index.adoc b/docs/modules/ROOT/pages/using-tika/java-api/index.adoc
index 4853446d50..22844404a0 100644
--- a/docs/modules/ROOT/pages/using-tika/java-api/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/java-api/index.adoc
@@ -24,9 +24,49 @@ This section covers using Apache Tika programmatically in your Java applications
Tika can be embedded directly into your Java applications as a library. This gives you
full control over parsing, detection, and configuration.
-However, for most use cases we recommend using xref:using-tika/server/index.adoc[tika-server]
-or xref:using-tika/grpc/index.adoc[tika-grpc] instead. See
-xref:using-tika/java-api/getting-started.adoc[Getting Started] for guidance on choosing the right approach.
+IMPORTANT: Some file formats can trigger excessive memory use, infinite loops, or JVM
+crashes in the underlying parsing libraries. For production systems processing untrusted
+files, use xref:pipes/index.adoc[Tika Pipes], which runs each parse in a forked JVM with
+timeouts and memory limits. Alternatively, xref:using-tika/server/index.adoc[tika-server]
+and xref:using-tika/grpc/index.adoc[tika-grpc] provide the same robustness as a service.
+See xref:advanced/robustness.adoc[Robustness] for details.
+
+== Dependencies
+
+Add the following to your `pom.xml`:
+
+[source,xml,subs=attributes+]
+----
+<dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers-standard-package</artifactId>
+ <version>{tika-version}</version>
+</dependency>
+----
+
+This pulls in `tika-core` and all standard parsers (PDF, Office, HTML, etc.).
+
+If you only need detection (no parsing) or want to select parsers individually:
+
+[source,xml,subs=attributes+]
+----
+<dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>{tika-version}</version>
+</dependency>
+----
+
+To use `TikaLoader` for JSON-based configuration, also add:
+
+[source,xml,subs=attributes+]
+----
+<dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-serialization</artifactId>
+ <version>{tika-version}</version>
+</dependency>
+----
== Parsers
@@ -165,14 +205,15 @@ container detection becomes available.
[source,java]
----
-TikaConfig tika = new TikaConfig();
+TikaLoader loader = TikaLoader.loadDefault();
+Detector detector = loader.loadDetectors();
ParseContext parseContext = new ParseContext();
for (Path p : myListOfPaths) {
Metadata metadata = new Metadata();
try (TikaInputStream stream = TikaInputStream.get(p, metadata)) {
- MediaType mimetype = tika.getDetector().detect(stream, metadata, parseContext);
+ MediaType mimetype = detector.detect(stream, metadata, parseContext);
System.out.println("File " + p + " is " + mimetype);
}
}
diff --git a/docs/supplemental-ui/partials/toolbar.hbs b/docs/supplemental-ui/partials/toolbar.hbs
new file mode 100644
index 0000000000..c3fc82b802
--- /dev/null
+++ b/docs/supplemental-ui/partials/toolbar.hbs
@@ -0,0 +1,3 @@
+<div class="toolbar" role="navigation">
+<button class="nav-toggle"></button>
+</div>