external/liborcus/UnpackedTarball_liborcus.mk | 4 -- external/liborcus/orcus-xml-finalize.patch.1 | 13 ++++++++ filter/Configuration_filter.mk | 4 ++ filter/source/config/cache/typedetection.cxx | 14 ++++++--- filter/source/config/fragments/filters/calc_json_Orcus.xcu | 19 +++++++++++++ filter/source/config/fragments/filters/calc_xml_Orcus.xcu | 19 +++++++++++++ filter/source/config/fragments/types/generic_JSON.xcu | 18 ++++++++++++ filter/source/config/fragments/types/generic_XML.xcu | 18 ++++++++++++ sc/source/filter/orcus/filterdetect.cxx | 4 ++ sc/source/filter/orcus/orcusfiltersimpl.cxx | 2 + sc/source/filter/orcus/xmlcontext.cxx | 2 - 11 files changed, 108 insertions(+), 9 deletions(-)
New commits: commit cd347097f726eae68fa819fe244d0bedf13832e9 Author: Kohei Yoshida <[email protected]> AuthorDate: Thu Aug 21 21:24:37 2025 -0400 Commit: Kohei Yoshida <[email protected]> CommitDate: Sat Aug 23 15:41:11 2025 +0200 Automatically map generic JSON and XML documents to Calc This includes one bug fix where orcus_xml did not call import_factory's finalize() interface method per interface contract. With it properly fixed in liborcus itself, it is no longer necessary to manually call it at the end of ScOrcusXMLContextImpl::importXML(). Also moving up the type detection order for the generic HTML type to be before the generic XML type since something like <html><body>some text</body></html> can be detected as a valid XML if it is checked first. A (well-formed) HTML can be considered a specialized XML, so it should come before XML. Change-Id: I20f24706421104181ad3dc8250d93f78d6b611e9 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/190042 Tested-by: Jenkins Reviewed-by: Kohei Yoshida <[email protected]> diff --git a/external/liborcus/UnpackedTarball_liborcus.mk b/external/liborcus/UnpackedTarball_liborcus.mk index 705feccef8c0..a4df2b30e51c 100644 --- a/external/liborcus/UnpackedTarball_liborcus.mk +++ b/external/liborcus/UnpackedTarball_liborcus.mk @@ -13,12 +13,10 @@ $(eval $(call gb_UnpackedTarball_set_tarball,liborcus,$(ORCUS_TARBALL))) $(eval $(call gb_UnpackedTarball_update_autoconf_configs,liborcus)) -# external/liborcus/0001-const-up-some-things-and-move-them-out-of-data-secti.patch -# upstream effort as: https://gitlab.com/orcus/orcus/-/merge_requests/225 - $(eval $(call gb_UnpackedTarball_add_patches,liborcus,\ external/liborcus/rpath.patch.0 \ external/liborcus/libtool.patch.0 \ + external/liborcus/orcus-xml-finalize.patch.1 \ )) ifeq ($(OS),WNT) diff --git a/external/liborcus/orcus-xml-finalize.patch.1 b/external/liborcus/orcus-xml-finalize.patch.1 new file mode 100644 index 000000000000..5b5f2e8b97af --- /dev/null +++ 
b/external/liborcus/orcus-xml-finalize.patch.1 @@ -0,0 +1,13 @@ +diff --git a/src/liborcus/orcus_xml.cpp b/src/liborcus/orcus_xml.cpp +index 9c87a78e..f91538dd 100644 +--- a/src/liborcus/orcus_xml.cpp ++++ b/src/liborcus/orcus_xml.cpp +@@ -641,6 +641,8 @@ void orcus_xml::read_stream(std::string_view stream) + + sax_ns_parser<xml_data_sax_handler> parser(stream, ns_cxt, handler); + parser.parse(); ++ ++ mp_impl->im_factory->finalize(); + } + + #if ORCUS_DEBUG_XML diff --git a/filter/Configuration_filter.mk b/filter/Configuration_filter.mk index 5b6bda1d617c..7dcf7c7489d2 100644 --- a/filter/Configuration_filter.mk +++ b/filter/Configuration_filter.mk @@ -467,8 +467,10 @@ $(eval $(call filter_Configuration_add_types,fcfg_langpack,fcfg_calc_types.xcu,f calc_DIF \ calc_ODS_FlatXML \ calc_HTML \ + generic_JSON \ generic_HTML \ generic_Text \ + generic_XML \ calc_Gnumeric \ calc_Lotus \ calc_Parquet \ @@ -528,6 +530,8 @@ $(eval $(call filter_Configuration_add_filters,fcfg_langpack,fcfg_calc_filters.x StarOffice_XML__Calc_ \ Text___txt___csv__StarCalc_ \ calc_csv_Orcus \ + calc_json_Orcus \ + calc_xml_Orcus \ calc_HTML_WebQuery \ calc_StarOffice_XML_Calc_Template \ calc_pdf_Export \ diff --git a/filter/source/config/cache/typedetection.cxx b/filter/source/config/cache/typedetection.cxx index 0706e0ab5d16..49efe1c1a9f8 100644 --- a/filter/source/config/cache/typedetection.cxx +++ b/filter/source/config/cache/typedetection.cxx @@ -263,6 +263,8 @@ int getFlatTypeRank(std::u16string_view rType) "XHTML_File", "svg_Scalable_Vector_Graphics", "math_MathML_XML_Math", + "generic_HTML", + "generic_XML", // Non-compressed text "dxf_AutoCAD_Interchange", @@ -274,7 +276,7 @@ int getFlatTypeRank(std::u16string_view rType) "xbm_X_Consortium", "writer_Rich_Text_Format", "writer_web_HTML_help", - "generic_HTML", + "generic_JSON", "generic_Markdown", "generic_Text", // Plain text (catch all) @@ -993,7 +995,7 @@ OUString TypeDetection::impl_detectTypeFlatAndDeep( utl::MediaDescriptor& r // 
if no further type could be detected. // It must be the first one, because it can be a preferred type. // Our types list was sorted by such criteria! - // d) detect service return a valid result => return its decision + // d) detect service return a valid result => return its decision but only when it matches the type being tested // e) detect service return an invalid result // or any needed information could not be // obtained from the cache => ignore it, and continue with search @@ -1041,8 +1043,12 @@ OUString TypeDetection::impl_detectTypeFlatAndDeep( utl::MediaDescriptor& r OUString sDeepType = impl_askDetectService(sDetectService, rDescriptor); - // d) - if (!sDeepType.isEmpty()) + // d) call it 'detected' only when the reported type matches the + // type being checked for. This is important because many detectors + // report multiple different types which may mess up our strict type + // check order e.g when the type being tested is of higher + // complexity and the detector reports a type of lower complexity. + if (sDeepType == sFlatType) return sDeepType; } catch(const css::container::NoSuchElementException&) diff --git a/filter/source/config/fragments/filters/calc_json_Orcus.xcu b/filter/source/config/fragments/filters/calc_json_Orcus.xcu new file mode 100644 index 000000000000..22b087c6f8ba --- /dev/null +++ b/filter/source/config/fragments/filters/calc_json_Orcus.xcu @@ -0,0 +1,19 @@ +<!-- + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+--> +<node oor:name="Orcus JSON" oor:op="replace"> + <prop oor:name="Flags"><value>IMPORT ALIEN</value></prop> + <prop oor:name="UIComponent"/> + <prop oor:name="FilterService"/> + <prop oor:name="UserData"/> + <prop oor:name="Type"><value>generic_JSON</value></prop> + <prop oor:name="TemplateName"/> + <prop oor:name="DocumentService"><value>com.sun.star.sheet.SpreadsheetDocument</value></prop> + <prop oor:name="UIName"> + <value xml:lang="en-US">JSON Document (Calc)</value> + </prop> +</node> diff --git a/filter/source/config/fragments/filters/calc_xml_Orcus.xcu b/filter/source/config/fragments/filters/calc_xml_Orcus.xcu new file mode 100644 index 000000000000..fa1199b9e19e --- /dev/null +++ b/filter/source/config/fragments/filters/calc_xml_Orcus.xcu @@ -0,0 +1,19 @@ +<!-- + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. +--> +<node oor:name="Orcus XML" oor:op="replace"> + <prop oor:name="Flags"><value>IMPORT ALIEN</value></prop> + <prop oor:name="UIComponent"/> + <prop oor:name="FilterService"/> + <prop oor:name="UserData"/> + <prop oor:name="Type"><value>generic_XML</value></prop> + <prop oor:name="TemplateName"/> + <prop oor:name="DocumentService"><value>com.sun.star.sheet.SpreadsheetDocument</value></prop> + <prop oor:name="UIName"> + <value xml:lang="en-US">XML Document (Calc)</value> + </prop> +</node> diff --git a/filter/source/config/fragments/types/generic_JSON.xcu b/filter/source/config/fragments/types/generic_JSON.xcu new file mode 100644 index 000000000000..664eb477704a --- /dev/null +++ b/filter/source/config/fragments/types/generic_JSON.xcu @@ -0,0 +1,18 @@ +<!-- + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * +--> + <node oor:name="generic_JSON" oor:op="replace" > + <prop oor:name="DetectService"><value>com.sun.star.comp.sc.OrcusFilterDetect</value></prop> + <prop oor:name="URLPattern"/> + <prop oor:name="Extensions"><value>json</value></prop> + <prop oor:name="MediaType"><value>application/json</value></prop> + <prop oor:name="Preferred"><value>false</value></prop> + <prop oor:name="PreferredFilter"><value>calc_json_Orcus</value></prop> + <prop oor:name="UIName"><value>JSON Document</value></prop> + <prop oor:name="ClipboardFormat"/> + </node> diff --git a/filter/source/config/fragments/types/generic_XML.xcu b/filter/source/config/fragments/types/generic_XML.xcu new file mode 100644 index 000000000000..897e04e2f21a --- /dev/null +++ b/filter/source/config/fragments/types/generic_XML.xcu @@ -0,0 +1,18 @@ +<!-- + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ * +--> + <node oor:name="generic_XML" oor:op="replace" > + <prop oor:name="DetectService"><value>com.sun.star.comp.sc.OrcusFilterDetect</value></prop> + <prop oor:name="URLPattern"/> + <prop oor:name="Extensions"><value>xml</value></prop> + <prop oor:name="MediaType"><value>application/xml</value></prop> + <prop oor:name="Preferred"><value>false</value></prop> + <prop oor:name="PreferredFilter"><value>calc_xml_Orcus</value></prop> + <prop oor:name="UIName"><value>XML Document</value></prop> + <prop oor:name="ClipboardFormat"/> + </node> diff --git a/sc/source/filter/orcus/filterdetect.cxx b/sc/source/filter/orcus/filterdetect.cxx index 6d04089e7ee3..d8bfc2fe23cc 100644 --- a/sc/source/filter/orcus/filterdetect.cxx +++ b/sc/source/filter/orcus/filterdetect.cxx @@ -94,6 +94,10 @@ OUString OrcusFormatDetect::detect(css::uno::Sequence<css::beans::PropertyValue> return u"calc_MS_Excel_2003_XML"_ustr; case orcus::format_t::parquet: return u"Apache Parquet"_ustr; + case orcus::format_t::xml: + return u"generic_XML"_ustr; + case orcus::format_t::json: + return u"generic_JSON"_ustr; default:; } diff --git a/sc/source/filter/orcus/orcusfiltersimpl.cxx b/sc/source/filter/orcus/orcusfiltersimpl.cxx index 85a04df3ea2a..c0e2b2facd61 100644 --- a/sc/source/filter/orcus/orcusfiltersimpl.cxx +++ b/sc/source/filter/orcus/orcusfiltersimpl.cxx @@ -102,6 +102,8 @@ ScOrcusFilters::ImportResult ScOrcusFiltersImpl::importByName(ScDocument& rDoc, { "Apache Parquet Spreadsheet", orcus::format_t::parquet }, { "Gnumeric Spreadsheet", orcus::format_t::gnumeric }, { "MS Excel 2003 XML Orcus", orcus::format_t::xls_xml }, + { "Orcus JSON", orcus::format_t::json }, + { "Orcus XML", orcus::format_t::xml }, { "Orcus CSV", orcus::format_t::csv }, { "csv", orcus::format_t::csv }, { "gnumeric", orcus::format_t::gnumeric }, diff --git a/sc/source/filter/orcus/xmlcontext.cxx b/sc/source/filter/orcus/xmlcontext.cxx index cda6546de0fc..c563cac4eba9 100644 --- a/sc/source/filter/orcus/xmlcontext.cxx +++ 
b/sc/source/filter/orcus/xmlcontext.cxx @@ -264,8 +264,6 @@ void ScOrcusXMLContextImpl::importXML(const ScOrcusImportXMLParam& rParam) orcus::file_content content = toFileContent(aSysPath); filter.read_stream(content.str()); - - aFactory.finalize(); } catch (const std::exception&) {
