This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 2610663401 fix tests, revert errant .local-repo setting (#2680)
2610663401 is described below
commit 2610663401dff7c831012fcf7ab589565177f846
Author: Tim Allison <[email protected]>
AuthorDate: Sat Mar 7 07:45:20 2026 -0500
fix tests, revert errant .local-repo setting (#2680)
---
.mvn/maven.config | 2 +-
.../ROOT/pages/advanced/charset-detection-design.adoc | 17 +++++++++++++++++
.../pipes/elasticsearch/tests/ElasticsearchTest.java | 15 ++++++++++-----
.../tika/pipes/solr/tests/TikaPipesSolrTestBase.java | 7 ++++---
4 files changed, 32 insertions(+), 9 deletions(-)
diff --git a/.mvn/maven.config b/.mvn/maven.config
index f354c80bbf..4aa793acab 100644
--- a/.mvn/maven.config
+++ b/.mvn/maven.config
@@ -1,4 +1,4 @@
-Dnisse.compat.osDetector
# fix for nlp module
-DeclipseP2RepoId=invalid
--Dmaven.repo.local=.local_m2_repo
+#-Dmaven.repo.local=.local_m2_repo
diff --git a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
index 94f72b1284..19709ed7df 100644
--- a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
+++ b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
@@ -1,3 +1,20 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
= Charset Detection Pipeline
:toc:
:toc-placement: left
diff --git a/tika-integration-tests/tika-pipes-es-integration-tests/src/test/java/org/apache/tika/pipes/elasticsearch/tests/ElasticsearchTest.java b/tika-integration-tests/tika-pipes-es-integration-tests/src/test/java/org/apache/tika/pipes/elasticsearch/tests/ElasticsearchTest.java
index a4b1ad0f67..89bf0e118a 100644
--- a/tika-integration-tests/tika-pipes-es-integration-tests/src/test/java/org/apache/tika/pipes/elasticsearch/tests/ElasticsearchTest.java
+++ b/tika-integration-tests/tika-pipes-es-integration-tests/src/test/java/org/apache/tika/pipes/elasticsearch/tests/ElasticsearchTest.java
@@ -151,7 +151,9 @@ public class ElasticsearchTest {
"\"from\": 0, \"size\": 1000 }";
results = client.postJson(endpoint + "/_search", query);
assertEquals(200, results.getStatus());
- assertEquals(numHtmlDocs + numTestDocs,
+ // OOM/crash docs (oom.xml, fake_oom.xml) kill the forked JVM and
+ // never emit to the data index, so subtract 2
+ assertEquals(numHtmlDocs + numTestDocs - 2,
results.getJson().get("hits").get("total").get("value")
.asInt());
@@ -219,8 +221,9 @@ public class ElasticsearchTest {
JsonResponse results =
client.postJson(endpoint + "/_search", query);
assertEquals(200, results.getStatus());
- assertEquals(numHtmlDocs + 3 + 12,
- // 3 mock files and the .docx has 11 embedded + itself
+ // 1 mock file (npe.xml emits; oom.xml + fake_oom.xml crash)
+ // + the .docx has 11 embedded + itself = 12
+ assertEquals(numHtmlDocs + 1 + 12,
results.getJson().get("hits").get("total").get("value")
.asInt());
@@ -306,7 +309,9 @@ public class ElasticsearchTest {
"\"match_all\": {} } }";
results = client.postJson(endpoint + "/_search", query);
assertEquals(200, results.getStatus());
- assertEquals(numHtmlDocs + 3 + 12,
+ // 1 mock file (npe.xml emits; oom.xml + fake_oom.xml crash)
+ // + the .docx has 11 embedded + itself = 12
+ assertEquals(numHtmlDocs + 1 + 12,
results.getJson().get("hits").get("total").get("value")
.asInt());
@@ -717,7 +722,7 @@ public class ElasticsearchTest {
throws Exception {
Files.createDirectories(testDocDirectory);
for (int i = 0; i < numHtmlDocs; ++i) {
- String html = "<html><body>" + bodyContent +
+ String html = "<html><head><meta charset=\"UTF-8\"></head><body>" + bodyContent +
"</body></html>";
Path p = testDocDirectory.resolve("test-" + i + ".html");
Files.write(p, html.getBytes(StandardCharsets.UTF_8));
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
index 75f92c2732..a4fc8f3abc 100644
--- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
+++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
@@ -164,7 +164,8 @@ public abstract class TikaPipesSolrTestBase {
Files.createDirectories(testFileFolder);
for (int i = 0; i < numDocs; ++i) {
Files.writeString(testFileFolder.resolve("test-" + i + ".html"),
- "<html><body>" + bodyContent + "</body></html>", StandardCharsets.UTF_8);
+ "<html><head><meta charset=\"UTF-8\"></head><body>" + bodyContent
+ + "</body></html>", StandardCharsets.UTF_8);
}
try (InputStream is = this.getClass().getResourceAsStream("/embedded/embedded.docx")) {
Files.copy(is, testFileFolder.resolve("test-embedded.docx"));
@@ -264,7 +265,7 @@ public abstract class TikaPipesSolrTestBase {
try (SolrClient solrClient = new Http2SolrClient.Builder(solrEndpoint).build()) {
solrClient.commit(collection, true, true);
assertEquals(numDocs, solrClient.query(collection,
- new SolrQuery("mime_s:\"text/html; charset=ISO-8859-1\"")).getResults()
+ new SolrQuery("mime_s:text/html*")).getResults()
.getNumFound());
assertEquals(numDocs,
solrClient.query(collection, new SolrQuery("content_s:*initial*")).getResults()
@@ -298,7 +299,7 @@ public abstract class TikaPipesSolrTestBase {
try (SolrClient solrClient = new Http2SolrClient.Builder(solrEndpoint).build()) {
solrClient.commit(collection, true, true);
assertEquals(numDocs, solrClient.query(collection,
- new SolrQuery("mime_s:\"text/html; charset=ISO-8859-1\"")).getResults()
+ new SolrQuery("mime_s:text/html*")).getResults()
.getNumFound());
assertEquals(numDocs,
solrClient.query(collection, new SolrQuery("content_s:*updated*")).getResults()