commit:     7938c21e268aff71e1d091dfbce1bfba8bde8308
Author:     Alfredo Tupone <tupone <AT> gentoo <DOT> org>
AuthorDate: Wed Feb 21 21:07:04 2024 +0000
Commit:     Alfredo Tupone <tupone <AT> gentoo <DOT> org>
CommitDate: Wed Feb 21 21:07:36 2024 +0000
URL:        https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=7938c21e

sci-libs/datasets: drop tests that require network

Closes: https://bugs.gentoo.org/925171
Signed-off-by: Alfredo Tupone <tupone <AT> gentoo.org>

 sci-libs/datasets/datasets-2.16.0.ebuild           |  14 ++
 .../datasets/files/datasets-2.16.0-tests.patch     | 160 +++++++++++++--------
 2 files changed, 116 insertions(+), 58 deletions(-)

diff --git a/sci-libs/datasets/datasets-2.16.0.ebuild b/sci-libs/datasets/datasets-2.16.0.ebuild
index 0325b5ae63d6..a34fcaa2f89c 100644
--- a/sci-libs/datasets/datasets-2.16.0.ebuild
+++ b/sci-libs/datasets/datasets-2.16.0.ebuild
@@ -66,4 +66,18 @@ src_prepare() {
        sed -i -e \
                "/pyarrow_hotfix/d" \
                src/datasets/features/features.py || die
+       sed -i \
+               -e "s:pytest.mark.integration:pytest.mark.skip():g" \
+               tests/test_arrow_dataset.py \
+               tests/test_fingerprint.py \
+               tests/test_hf_gcp.py \
+               tests/test_inspect.py \
+               tests/test_iterable_dataset.py \
+               tests/test_iterable_dataset.py \
+               tests/test_load.py \
+               tests/test_offline_util.py \
+               tests/test_streaming_download_manager.py \
+               tests/commands/test_test.py \
+               tests/packaged_modules/test_cache.py \
+               || die
 }
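
For context, the sed call added above rewrites the pytest markers in place
instead of deleting the tests. A minimal sketch of its effect on a
hypothetical test function (the name below is illustrative, not from the
datasets test suite):

    import pytest

    # Before: marked as an integration test. It still runs (and hits the
    # network) unless the runner filters markers, e.g. -m "not integration".
    @pytest.mark.integration
    def test_fetch_from_hub():
        ...

    # After "s:pytest.mark.integration:pytest.mark.skip():g" the same
    # decorator becomes an unconditional skip, so the test never attempts
    # network access during the sandboxed src_test phase.
    @pytest.mark.skip()
    def test_fetch_from_hub():
        ...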

diff --git a/sci-libs/datasets/files/datasets-2.16.0-tests.patch b/sci-libs/datasets/files/datasets-2.16.0-tests.patch
index 6b2845bce168..8cb89e824b3b 100644
--- a/sci-libs/datasets/files/datasets-2.16.0-tests.patch
+++ b/sci-libs/datasets/files/datasets-2.16.0-tests.patch
@@ -10,51 +10,72 @@
      ],
 --- a/tests/test_load.py       2024-02-20 22:12:13.699209107 +0100
 +++ b/tests/test_load.py       2024-02-20 22:13:10.862626708 +0100
-@@ -386,21 +386,6 @@
+@@ -386,6 +386,7 @@
              hf_modules_cache=self.hf_modules_cache,
          )
  
--    def test_HubDatasetModuleFactoryWithScript_dont_trust_remote_code(self):
--        # "squad" has a dataset script
--        factory = HubDatasetModuleFactoryWithScript(
--            "squad", download_config=self.download_config, dynamic_modules_path=self.dynamic_modules_path
--        )
--        with patch.object(config, "HF_DATASETS_TRUST_REMOTE_CODE", None):  # this will be the default soon
--            self.assertRaises(ValueError, factory.get_module)
--        factory = HubDatasetModuleFactoryWithScript(
--            "squad",
--            download_config=self.download_config,
--            dynamic_modules_path=self.dynamic_modules_path,
--            trust_remote_code=False,
--        )
--        self.assertRaises(ValueError, factory.get_module)
--
++    @pytest.mark.skip(reason="")
+     def test_HubDatasetModuleFactoryWithScript_dont_trust_remote_code(self):
+         # "squad" has a dataset script
+         factory = HubDatasetModuleFactoryWithScript(
+@@ -402,6 +402,7 @@
+         )
+         self.assertRaises(ValueError, factory.get_module)
+ 
++    @pytest.mark.skip()
      def test_HubDatasetModuleFactoryWithScript_with_github_dataset(self):
          # "wmt_t2t" has additional imports (internal)
          factory = HubDatasetModuleFactoryWithScript(
-@@ -1235,12 +1235,6 @@
- 
- 
- @pytest.mark.integration
--def test_load_streaming_private_dataset_with_zipped_data(hf_token, hf_private_dataset_repo_zipped_txt_data):
--    ds = load_dataset(hf_private_dataset_repo_zipped_txt_data, streaming=True, token=hf_token)
--    assert next(iter(ds)) is not None
--
--
--@pytest.mark.integration
- def test_load_dataset_config_kwargs_passed_as_arguments():
-     ds_default = load_dataset(SAMPLE_DATASET_IDENTIFIER4)
-     ds_custom = load_dataset(SAMPLE_DATASET_IDENTIFIER4, drop_metadata=True)
+@@ -411,6 +412,7 @@
+         assert importlib.import_module(module_factory_result.module_path) is not None
+         assert module_factory_result.builder_kwargs["base_path"].startswith(config.HF_ENDPOINT)
+ 
++    @pytest.mark.skip()
+     def test_GithubMetricModuleFactory_with_internal_import(self):
+         # "squad_v2" requires additional imports (internal)
+         factory = GithubMetricModuleFactory(
+@@ -419,6 +421,7 @@
+         module_factory_result = factory.get_module()
+         assert importlib.import_module(module_factory_result.module_path) is not None
+ 
++    @pytest.mark.skip()
+     @pytest.mark.filterwarnings("ignore:GithubMetricModuleFactory is deprecated:FutureWarning")
+     def test_GithubMetricModuleFactory_with_external_import(self):
+         # "bleu" requires additional imports (external from github)
+@@ -1032,6 +1035,7 @@
+         datasets.load_dataset_builder(SAMPLE_DATASET_TWO_CONFIG_IN_METADATA, "non-existing-config")
+ 
+ 
++@pytest.mark.skip()
+ @pytest.mark.parametrize("serializer", [pickle, dill])
+ def test_load_dataset_builder_with_metadata_configs_pickable(serializer):
+     builder = datasets.load_dataset_builder(SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA)
+@@ -1153,6 +1157,7 @@
+     assert len(builder.config.data_files["test"]) > 0
+ 
+ 
++@pytest.mark.skip()
+ def test_load_dataset_builder_fail():
+     with pytest.raises(DatasetNotFoundError):
+         datasets.load_dataset_builder("blabla")
+@@ -1168,6 +1173,7 @@
+     assert isinstance(next(iter(dataset["train"])), dict)
+ 
+ 
++@pytest.mark.skip()
+ def test_load_dataset_cached_local_script(dataset_loading_script_dir, data_dir, caplog):
+     dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir)
+     assert isinstance(dataset, DatasetDict)
 --- a/tests/test_hf_gcp.py     2024-02-21 09:59:26.918397895 +0100
 +++ b/tests/test_hf_gcp.py     2024-02-21 09:59:46.335100597 +0100
-@@ -21,7 +21,6 @@
-     {"dataset": "wikipedia", "config_name": "20220301.frr"},
-     {"dataset": "wikipedia", "config_name": "20220301.it"},
-     {"dataset": "wikipedia", "config_name": "20220301.simple"},
--    {"dataset": "eli5", "config_name": "LFQA_reddit"},
-     {"dataset": "wiki40b", "config_name": "en"},
-     {"dataset": "wiki_dpr", "config_name": "psgs_w100.nq.compressed"},
-     {"dataset": "wiki_dpr", "config_name": "psgs_w100.nq.no_index"},
+@@ -47,6 +47,7 @@
+         ]
+ 
+ 
++@pytest.mark.skip("network")
+ @parameterized.named_parameters(list_datasets_on_hf_gcp_parameters(with_config=True))
+ class TestDatasetOnHfGcp(TestCase):
+     dataset = None
 --- a/tests/test_inspect.py    2024-02-21 10:03:32.315520016 +0100
 +++ b/tests/test_inspect.py    2024-02-21 10:03:50.345553490 +0100
 @@ -18,7 +18,7 @@
@@ -66,24 +87,47 @@
  def test_inspect_dataset(path, tmp_path):
      inspect_dataset(path, tmp_path)
      script_name = Path(path).stem + ".py"
---- a/tests/packaged_modules/test_cache.py     2024-02-21 12:04:18.036866572 +0100
-+++ b/tests/packaged_modules/test_cache.py     2024-02-21 12:04:54.333558520 +0100
-@@ -44,18 +44,3 @@
-         Cache(dataset_name=text_dir.name, hash="missing").download_and_prepare()
-     with pytest.raises(ValueError):
-         Cache(dataset_name=text_dir.name, config_name="missing", version="auto", hash="auto").download_and_prepare()
--
--
--@pytest.mark.integration
--def test_cache_multi_configs():
--    repo_id = SAMPLE_DATASET_TWO_CONFIG_IN_METADATA
--    dataset_name = repo_id.split("/")[-1]
--    config_name = "v1"
--    ds = load_dataset(repo_id, config_name)
--    cache = Cache(dataset_name=dataset_name, repo_id=repo_id, config_name=config_name, version="auto", hash="auto")
--    reloaded = cache.as_dataset()
--    assert list(ds) == list(reloaded)
--    assert len(ds["train"]) == len(reloaded["train"])
--    with pytest.raises(ValueError) as excinfo:
--        Cache(dataset_name=dataset_name, repo_id=repo_id, config_name="missing", version="auto", hash="auto")
--    assert config_name in str(excinfo.value)
+@@ -49,6 +49,7 @@
+     assert list(info.splits.keys()) == expected_splits
+ 
+ 
++@pytest.mark.skip(reason="require network")
+ def test_get_dataset_config_info_private(hf_token, hf_private_dataset_repo_txt_data):
+     info = get_dataset_config_info(hf_private_dataset_repo_txt_data, config_name="default", token=hf_token)
+     assert list(info.splits.keys()) == ["train"]
+--- a/tests/test_data_files.py 2024-02-21 20:22:57.536160356 +0100
++++ b/tests/test_data_files.py 2024-02-21 20:25:00.153052174 +0100
+@@ -378,6 +378,7 @@
+         assert len(hub_dataset_repo_patterns_results[pattern]) == 0
+ 
+ 
++@pytest.mark.skip(reason="network")
+ def test_DataFilesList_from_patterns_locally_with_extra_files(complex_data_dir, text_file):
+     data_files_list = DataFilesList.from_patterns([_TEST_URL, text_file.as_posix()], complex_data_dir)
+     assert list(data_files_list) == [_TEST_URL, text_file.as_posix()]
+@@ -467,6 +468,7 @@
+         assert Hasher.hash(data_files1) != Hasher.hash(data_files2)
+ 
+ 
++@pytest.mark.skip(reason="network")
+ def test_DataFilesDict_from_patterns_locally_or_remote_hashing(text_file):
+     patterns = {"train": [_TEST_URL], "test": [str(text_file)]}
+     data_files1 = DataFilesDict.from_patterns(patterns)
+--- a/tests/packaged_modules/test_folder_based_builder.py      2024-02-21 21:30:20.718922523 +0100
++++ b/tests/packaged_modules/test_folder_based_builder.py      2024-02-21 21:31:46.309061287 +0100
+@@ -382,6 +382,7 @@
+         assert example[column] is not None
+ 
+ 
++@pytest.mark.skip(reason="network")
+ @pytest.mark.parametrize("remote", [True, False])
+ @pytest.mark.parametrize("drop_labels", [None, True, False])
+ def test_data_files_with_different_levels_no_metadata(
+@@ -405,6 +406,7 @@
+         assert all(example.keys() == {"base", "label"} for _, example in generator)
+ 
+ 
++@pytest.mark.skip(reason="network")
+ @pytest.mark.parametrize("remote", [False, True])
+ @pytest.mark.parametrize("drop_labels", [None, True, False])
+ def test_data_files_with_one_label_no_metadata(data_files_with_one_label_no_metadata, drop_labels, remote, cache_dir):

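A note on the marker spellings used throughout the patch: pytest accepts the
skip reason either positionally or as a keyword, so the variants above differ
only in the reason string they record. A minimal sketch (test names are
illustrative):

    import pytest

    @pytest.mark.skip()                  # skipped, no reason recorded
    def test_a():
        ...

    @pytest.mark.skip("network")         # reason passed positionally
    def test_b():
        ...

    @pytest.mark.skip(reason="network")  # reason passed as a keyword
    def test_c():
        ...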