This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 2610663401 fix tests, revert errant .local-repo setting (#2680)
2610663401 is described below

commit 2610663401dff7c831012fcf7ab589565177f846
Author: Tim Allison <[email protected]>
AuthorDate: Sat Mar 7 07:45:20 2026 -0500

    fix tests, revert errant .local-repo setting (#2680)
---
 .mvn/maven.config                                       |  2 +-
 .../ROOT/pages/advanced/charset-detection-design.adoc   | 17 +++++++++++++++++
 .../pipes/elasticsearch/tests/ElasticsearchTest.java    | 15 ++++++++++-----
 .../tika/pipes/solr/tests/TikaPipesSolrTestBase.java    |  7 ++++---
 4 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/.mvn/maven.config b/.mvn/maven.config
index f354c80bbf..4aa793acab 100644
--- a/.mvn/maven.config
+++ b/.mvn/maven.config
@@ -1,4 +1,4 @@
 -Dnisse.compat.osDetector
 # fix for nlp module
 -DeclipseP2RepoId=invalid
--Dmaven.repo.local=.local_m2_repo
+#-Dmaven.repo.local=.local_m2_repo
diff --git a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
index 94f72b1284..19709ed7df 100644
--- a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
+++ b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
@@ -1,3 +1,20 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
 = Charset Detection Pipeline
 :toc:
 :toc-placement: left
diff --git a/tika-integration-tests/tika-pipes-es-integration-tests/src/test/java/org/apache/tika/pipes/elasticsearch/tests/ElasticsearchTest.java b/tika-integration-tests/tika-pipes-es-integration-tests/src/test/java/org/apache/tika/pipes/elasticsearch/tests/ElasticsearchTest.java
index a4b1ad0f67..89bf0e118a 100644
--- a/tika-integration-tests/tika-pipes-es-integration-tests/src/test/java/org/apache/tika/pipes/elasticsearch/tests/ElasticsearchTest.java
+++ b/tika-integration-tests/tika-pipes-es-integration-tests/src/test/java/org/apache/tika/pipes/elasticsearch/tests/ElasticsearchTest.java
@@ -151,7 +151,9 @@ public class ElasticsearchTest {
                 "\"from\": 0, \"size\": 1000 }";
         results = client.postJson(endpoint + "/_search", query);
         assertEquals(200, results.getStatus());
-        assertEquals(numHtmlDocs + numTestDocs,
+        // OOM/crash docs (oom.xml, fake_oom.xml) kill the forked JVM and
+        // never emit to the data index, so subtract 2
+        assertEquals(numHtmlDocs + numTestDocs - 2,
                 results.getJson().get("hits").get("total").get("value")
                         .asInt());
 
@@ -219,8 +221,9 @@ public class ElasticsearchTest {
         JsonResponse results =
                 client.postJson(endpoint + "/_search", query);
         assertEquals(200, results.getStatus());
-        assertEquals(numHtmlDocs + 3 + 12,
-                // 3 mock files and the .docx has 11 embedded + itself
+        // 1 mock file (npe.xml emits; oom.xml + fake_oom.xml crash)
+        // + the .docx has 11 embedded + itself = 12
+        assertEquals(numHtmlDocs + 1 + 12,
                 results.getJson().get("hits").get("total").get("value")
                         .asInt());
 
@@ -306,7 +309,9 @@ public class ElasticsearchTest {
                 "\"match_all\": {} } }";
         results = client.postJson(endpoint + "/_search", query);
         assertEquals(200, results.getStatus());
-        assertEquals(numHtmlDocs + 3 + 12,
+        // 1 mock file (npe.xml emits; oom.xml + fake_oom.xml crash)
+        // + the .docx has 11 embedded + itself = 12
+        assertEquals(numHtmlDocs + 1 + 12,
                 results.getJson().get("hits").get("total").get("value")
                         .asInt());
 
@@ -717,7 +722,7 @@ public class ElasticsearchTest {
             throws Exception {
         Files.createDirectories(testDocDirectory);
         for (int i = 0; i < numHtmlDocs; ++i) {
-            String html = "<html><body>" + bodyContent +
+            String html = "<html><head><meta charset=\"UTF-8\"></head><body>" + bodyContent +
                     "</body></html>";
             Path p = testDocDirectory.resolve("test-" + i + ".html");
             Files.write(p, html.getBytes(StandardCharsets.UTF_8));
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
index 75f92c2732..a4fc8f3abc 100644
--- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
+++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
@@ -164,7 +164,8 @@ public abstract class TikaPipesSolrTestBase {
         Files.createDirectories(testFileFolder);
         for (int i = 0; i < numDocs; ++i) {
             Files.writeString(testFileFolder.resolve("test-" + i + ".html"),
-                    "<html><body>" + bodyContent + "</body></html>", StandardCharsets.UTF_8);
+                    "<html><head><meta charset=\"UTF-8\"></head><body>" + bodyContent
+                            + "</body></html>", StandardCharsets.UTF_8);
         }
         try (InputStream is = this.getClass().getResourceAsStream("/embedded/embedded.docx")) {
             Files.copy(is, testFileFolder.resolve("test-embedded.docx"));
@@ -264,7 +265,7 @@ public abstract class TikaPipesSolrTestBase {
         try (SolrClient solrClient = new Http2SolrClient.Builder(solrEndpoint).build()) {
             solrClient.commit(collection, true, true);
             assertEquals(numDocs, solrClient.query(collection,
-                            new SolrQuery("mime_s:\"text/html; charset=ISO-8859-1\"")).getResults()
+                            new SolrQuery("mime_s:text/html*")).getResults()
                     .getNumFound());
             assertEquals(numDocs,
                     solrClient.query(collection, new SolrQuery("content_s:*initial*")).getResults()
@@ -298,7 +299,7 @@ public abstract class TikaPipesSolrTestBase {
         try (SolrClient solrClient = new Http2SolrClient.Builder(solrEndpoint).build()) {
             solrClient.commit(collection, true, true);
             assertEquals(numDocs, solrClient.query(collection,
-                            new SolrQuery("mime_s:\"text/html; charset=ISO-8859-1\"")).getResults()
+                            new SolrQuery("mime_s:text/html*")).getResults()
                     .getNumFound());
             assertEquals(numDocs,
                     solrClient.query(collection, new SolrQuery("content_s:*updated*")).getResults()

Reply via email to