external/liborcus/UnpackedTarball_liborcus.mk              |    4 --
 external/liborcus/orcus-xml-finalize.patch.1               |   13 ++++++++
 filter/Configuration_filter.mk                             |    4 ++
 filter/source/config/cache/typedetection.cxx               |   14 ++++++---
 filter/source/config/fragments/filters/calc_json_Orcus.xcu |   19 +++++++++++++
 filter/source/config/fragments/filters/calc_xml_Orcus.xcu  |   19 +++++++++++++
 filter/source/config/fragments/types/generic_JSON.xcu      |   18 ++++++++++++
 filter/source/config/fragments/types/generic_XML.xcu       |   18 ++++++++++++
 sc/source/filter/orcus/filterdetect.cxx                    |    4 ++
 sc/source/filter/orcus/orcusfiltersimpl.cxx                |    2 +
 sc/source/filter/orcus/xmlcontext.cxx                      |    2 -
 11 files changed, 108 insertions(+), 9 deletions(-)

New commits:
commit cd347097f726eae68fa819fe244d0bedf13832e9
Author:     Kohei Yoshida <[email protected]>
AuthorDate: Thu Aug 21 21:24:37 2025 -0400
Commit:     Kohei Yoshida <[email protected]>
CommitDate: Sat Aug 23 15:41:11 2025 +0200

    Automatically map generic JSON and XML documents to Calc
    
    This includes one bug fix where orcus_xml did not call
    import_factory's finalize() interface method per interface
    contract.  With it properly fixed in liborcus itself, it
    is no longer necessary to manually call it at the end of
    ScOrcusXMLContextImpl::importXML().
    
    Also moving up the type detection order for the generic
    HTML type to be before the generic XML type since something
    like
    
    <html><body>some text</body></html>
    
    can be detected as a valid XML if it is checked first.
    A (well-formed) HTML can be considered a specialized XML,
    so it should come before XML.
    
    Change-Id: I20f24706421104181ad3dc8250d93f78d6b611e9
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/190042
    Tested-by: Jenkins
    Reviewed-by: Kohei Yoshida <[email protected]>

diff --git a/external/liborcus/UnpackedTarball_liborcus.mk 
b/external/liborcus/UnpackedTarball_liborcus.mk
index 705feccef8c0..a4df2b30e51c 100644
--- a/external/liborcus/UnpackedTarball_liborcus.mk
+++ b/external/liborcus/UnpackedTarball_liborcus.mk
@@ -13,12 +13,10 @@ $(eval $(call 
gb_UnpackedTarball_set_tarball,liborcus,$(ORCUS_TARBALL)))
 
 $(eval $(call gb_UnpackedTarball_update_autoconf_configs,liborcus))
 
-# 
external/liborcus/0001-const-up-some-things-and-move-them-out-of-data-secti.patch
-# upstream effort as: https://gitlab.com/orcus/orcus/-/merge_requests/225
-
 $(eval $(call gb_UnpackedTarball_add_patches,liborcus,\
        external/liborcus/rpath.patch.0 \
        external/liborcus/libtool.patch.0 \
+       external/liborcus/orcus-xml-finalize.patch.1 \
 ))
 
 ifeq ($(OS),WNT)
diff --git a/external/liborcus/orcus-xml-finalize.patch.1 
b/external/liborcus/orcus-xml-finalize.patch.1
new file mode 100644
index 000000000000..5b5f2e8b97af
--- /dev/null
+++ b/external/liborcus/orcus-xml-finalize.patch.1
@@ -0,0 +1,13 @@
+diff --git a/src/liborcus/orcus_xml.cpp b/src/liborcus/orcus_xml.cpp
+index 9c87a78e..f91538dd 100644
+--- a/src/liborcus/orcus_xml.cpp
++++ b/src/liborcus/orcus_xml.cpp
+@@ -641,6 +641,8 @@ void orcus_xml::read_stream(std::string_view stream)
+ 
+     sax_ns_parser<xml_data_sax_handler> parser(stream, ns_cxt, handler);
+     parser.parse();
++
++    mp_impl->im_factory->finalize();
+ }
+ 
+ #if ORCUS_DEBUG_XML
diff --git a/filter/Configuration_filter.mk b/filter/Configuration_filter.mk
index 5b6bda1d617c..7dcf7c7489d2 100644
--- a/filter/Configuration_filter.mk
+++ b/filter/Configuration_filter.mk
@@ -467,8 +467,10 @@ $(eval $(call 
filter_Configuration_add_types,fcfg_langpack,fcfg_calc_types.xcu,f
        calc_DIF \
        calc_ODS_FlatXML \
        calc_HTML \
+       generic_JSON \
        generic_HTML \
        generic_Text \
+       generic_XML \
        calc_Gnumeric \
        calc_Lotus \
        calc_Parquet \
@@ -528,6 +530,8 @@ $(eval $(call 
filter_Configuration_add_filters,fcfg_langpack,fcfg_calc_filters.x
        StarOffice_XML__Calc_ \
        Text___txt___csv__StarCalc_ \
        calc_csv_Orcus \
+       calc_json_Orcus \
+       calc_xml_Orcus \
        calc_HTML_WebQuery \
        calc_StarOffice_XML_Calc_Template \
        calc_pdf_Export \
diff --git a/filter/source/config/cache/typedetection.cxx 
b/filter/source/config/cache/typedetection.cxx
index 0706e0ab5d16..49efe1c1a9f8 100644
--- a/filter/source/config/cache/typedetection.cxx
+++ b/filter/source/config/cache/typedetection.cxx
@@ -263,6 +263,8 @@ int getFlatTypeRank(std::u16string_view rType)
         "XHTML_File",
         "svg_Scalable_Vector_Graphics",
         "math_MathML_XML_Math",
+        "generic_HTML",
+        "generic_XML",
 
         // Non-compressed text
         "dxf_AutoCAD_Interchange",
@@ -274,7 +276,7 @@ int getFlatTypeRank(std::u16string_view rType)
         "xbm_X_Consortium",
         "writer_Rich_Text_Format",
         "writer_web_HTML_help",
-        "generic_HTML",
+        "generic_JSON",
         "generic_Markdown",
 
         "generic_Text", // Plain text (catch all)
@@ -993,7 +995,7 @@ OUString TypeDetection::impl_detectTypeFlatAndDeep(      
utl::MediaDescriptor& r
     //                                               if no further type could 
be detected.
     //                                               It must be the first one, 
because it can be a preferred type.
     //                                               Our types list was sorted 
by such criteria!
-    // d) detect service return a valid result    => return its decision
+    // d) detect service return a valid result    => return its decision but 
only when it matches the type being tested
     // e) detect service return an invalid result
     //    or any needed information could not be
     //    obtained from the cache                 => ignore it, and continue 
with search
@@ -1041,8 +1043,12 @@ OUString TypeDetection::impl_detectTypeFlatAndDeep(      
utl::MediaDescriptor& r
 
             OUString sDeepType = impl_askDetectService(sDetectService, 
rDescriptor);
 
-            // d)
-            if (!sDeepType.isEmpty())
+            // d) call it 'detected' only when the reported type matches the
+            // type being checked for.  This is important because many 
detectors
+            // report multiple different types which may mess up our strict 
type
+            // check order e.g when the type being tested is of higher
+            // complexity and the detector reports a type of lower complexity.
+            if (sDeepType == sFlatType)
                 return sDeepType;
         }
         catch(const css::container::NoSuchElementException&)
diff --git a/filter/source/config/fragments/filters/calc_json_Orcus.xcu 
b/filter/source/config/fragments/filters/calc_json_Orcus.xcu
new file mode 100644
index 000000000000..22b087c6f8ba
--- /dev/null
+++ b/filter/source/config/fragments/filters/calc_json_Orcus.xcu
@@ -0,0 +1,19 @@
+<!--
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+-->
+<node oor:name="Orcus JSON" oor:op="replace">
+    <prop oor:name="Flags"><value>IMPORT ALIEN</value></prop>
+    <prop oor:name="UIComponent"/>
+    <prop oor:name="FilterService"/>
+    <prop oor:name="UserData"/>
+    <prop oor:name="Type"><value>generic_JSON</value></prop>
+    <prop oor:name="TemplateName"/>
+    <prop 
oor:name="DocumentService"><value>com.sun.star.sheet.SpreadsheetDocument</value></prop>
+    <prop oor:name="UIName">
+        <value xml:lang="en-US">JSON Document (Calc)</value>
+    </prop>
+</node>
diff --git a/filter/source/config/fragments/filters/calc_xml_Orcus.xcu 
b/filter/source/config/fragments/filters/calc_xml_Orcus.xcu
new file mode 100644
index 000000000000..fa1199b9e19e
--- /dev/null
+++ b/filter/source/config/fragments/filters/calc_xml_Orcus.xcu
@@ -0,0 +1,19 @@
+<!--
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+-->
+<node oor:name="Orcus XML" oor:op="replace">
+    <prop oor:name="Flags"><value>IMPORT ALIEN</value></prop>
+    <prop oor:name="UIComponent"/>
+    <prop oor:name="FilterService"/>
+    <prop oor:name="UserData"/>
+    <prop oor:name="Type"><value>generic_XML</value></prop>
+    <prop oor:name="TemplateName"/>
+    <prop 
oor:name="DocumentService"><value>com.sun.star.sheet.SpreadsheetDocument</value></prop>
+    <prop oor:name="UIName">
+        <value xml:lang="en-US">XML Document (Calc)</value>
+    </prop>
+</node>
diff --git a/filter/source/config/fragments/types/generic_JSON.xcu 
b/filter/source/config/fragments/types/generic_JSON.xcu
new file mode 100644
index 000000000000..664eb477704a
--- /dev/null
+++ b/filter/source/config/fragments/types/generic_JSON.xcu
@@ -0,0 +1,18 @@
+<!--
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+-->
+    <node oor:name="generic_JSON" oor:op="replace" >
+        <prop 
oor:name="DetectService"><value>com.sun.star.comp.sc.OrcusFilterDetect</value></prop>
+        <prop oor:name="URLPattern"/>
+        <prop oor:name="Extensions"><value>json</value></prop>
+        <prop oor:name="MediaType"><value>application/json</value></prop>
+        <prop oor:name="Preferred"><value>false</value></prop>
+        <prop oor:name="PreferredFilter"><value>calc_json_Orcus</value></prop>
+        <prop oor:name="UIName"><value>JSON Document</value></prop>
+        <prop oor:name="ClipboardFormat"/>
+    </node>
diff --git a/filter/source/config/fragments/types/generic_XML.xcu 
b/filter/source/config/fragments/types/generic_XML.xcu
new file mode 100644
index 000000000000..897e04e2f21a
--- /dev/null
+++ b/filter/source/config/fragments/types/generic_XML.xcu
@@ -0,0 +1,18 @@
+<!--
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+-->
+    <node oor:name="generic_XML" oor:op="replace" >
+        <prop 
oor:name="DetectService"><value>com.sun.star.comp.sc.OrcusFilterDetect</value></prop>
+        <prop oor:name="URLPattern"/>
+        <prop oor:name="Extensions"><value>xml</value></prop>
+        <prop oor:name="MediaType"><value>application/xml</value></prop>
+        <prop oor:name="Preferred"><value>false</value></prop>
+        <prop oor:name="PreferredFilter"><value>calc_xml_Orcus</value></prop>
+        <prop oor:name="UIName"><value>XML Document</value></prop>
+        <prop oor:name="ClipboardFormat"/>
+    </node>
diff --git a/sc/source/filter/orcus/filterdetect.cxx 
b/sc/source/filter/orcus/filterdetect.cxx
index 6d04089e7ee3..d8bfc2fe23cc 100644
--- a/sc/source/filter/orcus/filterdetect.cxx
+++ b/sc/source/filter/orcus/filterdetect.cxx
@@ -94,6 +94,10 @@ OUString 
OrcusFormatDetect::detect(css::uno::Sequence<css::beans::PropertyValue>
             return u"calc_MS_Excel_2003_XML"_ustr;
         case orcus::format_t::parquet:
             return u"Apache Parquet"_ustr;
+        case orcus::format_t::xml:
+            return u"generic_XML"_ustr;
+        case orcus::format_t::json:
+            return u"generic_JSON"_ustr;
         default:;
     }
 
diff --git a/sc/source/filter/orcus/orcusfiltersimpl.cxx 
b/sc/source/filter/orcus/orcusfiltersimpl.cxx
index 85a04df3ea2a..c0e2b2facd61 100644
--- a/sc/source/filter/orcus/orcusfiltersimpl.cxx
+++ b/sc/source/filter/orcus/orcusfiltersimpl.cxx
@@ -102,6 +102,8 @@ ScOrcusFilters::ImportResult 
ScOrcusFiltersImpl::importByName(ScDocument& rDoc,
         { "Apache Parquet Spreadsheet", orcus::format_t::parquet },
         { "Gnumeric Spreadsheet", orcus::format_t::gnumeric },
         { "MS Excel 2003 XML Orcus", orcus::format_t::xls_xml },
+        { "Orcus JSON", orcus::format_t::json },
+        { "Orcus XML", orcus::format_t::xml },
         { "Orcus CSV", orcus::format_t::csv },
         { "csv", orcus::format_t::csv },
         { "gnumeric", orcus::format_t::gnumeric },
diff --git a/sc/source/filter/orcus/xmlcontext.cxx 
b/sc/source/filter/orcus/xmlcontext.cxx
index cda6546de0fc..c563cac4eba9 100644
--- a/sc/source/filter/orcus/xmlcontext.cxx
+++ b/sc/source/filter/orcus/xmlcontext.cxx
@@ -264,8 +264,6 @@ void ScOrcusXMLContextImpl::importXML(const 
ScOrcusImportXMLParam& rParam)
 
         orcus::file_content content = toFileContent(aSysPath);
         filter.read_stream(content.str());
-
-        aFactory.finalize();
     }
     catch (const std::exception&)
     {

Reply via email to