argilla-io · frascuchon · Mar 1, 2023 · Feb 28, 2023 · Feb 28, 2023
diff --git a/src/argilla/client/apis/datasets.py b/src/argilla/client/apis/datasets.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+import math
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
@@ -163,23 +164,23 @@ def scan(
             name: the dataset
             query: the search query
             projection: a subset of record fields to retrieve. If not provided,
-            limit: The number of records to retrieve
+                only the record ids will be returned.
+            limit: The number of records to retrieve.
             id_from: If provided, starts gathering the records starting from that Record.
                 As the Records returned with the load method are sorted by ID, ´id_from´
                 can be used to load using batches.
-            only id's will be returned
 
         Returns:
-
             An iterable of raw object containing per-record info
-
         """
 
-        url = f"{self._API_PREFIX}/{name}/records/:search?limit={self.DEFAULT_SCAN_SIZE}"
-        query = self._parse_query(query=query)
+        if limit and limit < 0:
+            raise ValueError("The scan limit must be non-negative.")
 
-        if limit == 0:
-            limit = None
+        batch_size = self.DEFAULT_SCAN_SIZE
+        limit = limit if limit else math.inf
+        url = f"{self._API_PREFIX}/{name}/records/:search?limit={{limit}}"
+        query = self._parse_query(query=query)
 
         request = {
             "fields": list(projection) if projection else ["id"],
@@ -189,24 +190,24 @@ def scan(
         if id_from:
             request["next_idx"] = id_from
 
-        yield_fields = 0
         with api_compatibility(self, min_version="1.2.0"):
+            request_limit = min(limit, batch_size)
             response = self.http_client.post(
-                url,
+                url.format(limit=request_limit),
                 json=request,
             )
 
             while response.get("records"):
-                for record in response["records"]:
-                    yield record
-                    yield_fields += 1
-                    if limit and limit <= yield_fields:
-                        return
+                yield from response["records"]
+                limit -= request_limit
+                if limit <= 0:
+                    return
 
                 next_idx = response.get("next_idx")
                 if next_idx:
+                    request_limit = min(limit, batch_size)
                     response = self.http_client.post(
-                        path=url,
+                        path=url.format(limit=request_limit),
                         json={**request, "next_idx": next_idx},
                     )
 

diff --git a/tests/client/functional_tests/test_scan_raw_records.py b/tests/client/functional_tests/test_scan_raw_records.py
@@ -67,3 +67,58 @@ def test_scan_records_without_results(
     )
     data = list(data)
     assert len(data) == 0
+
+
+def test_scan_fail_negative_limit(
+    mocked_client,
+    gutenberg_spacy_ner,
+):
+    with pytest.raises(ValueError, match="limit.*negative"):
+        data = active_api().datasets.scan(
+            name=gutenberg_spacy_ner,
+            limit=-20,
+        )
+        # Consume the generator: scan() is lazy, so the error is only raised on iteration
+        data = list(data)
+
+
+@pytest.mark.parametrize(("limit"), [6, 23, 20])
+def test_scan_efficient_limiting(
+    monkeypatch: pytest.MonkeyPatch,
+    limit,
+    gutenberg_spacy_ner,
+):
+    client_datasets = active_api().datasets
+    # Reduce the default scan size to something small to better test the situation
+    # where limit > DEFAULT_SCAN_SIZE
+    batch_size = 10
+    monkeypatch.setattr(client_datasets, "DEFAULT_SCAN_SIZE", batch_size)
+
+    # Monkeypatch the .post() call to track with what URLs the server is called
+    called_paths = []
+    original_post = active_api().http_client.post
+
+    def tracked_post(path, *args, **kwargs):
+        called_paths.append(path)
+        return original_post(path, *args, **kwargs)
+
+    monkeypatch.setattr(active_api().http_client, "post", tracked_post)
+
+    # Try to fetch `limit` samples from the 100
+    data = client_datasets.scan(name=gutenberg_spacy_ner, limit=limit)
+    data = list(data)
+
+    # Ensure that `limit` samples were indeed fetched
+    assert len(data) == limit
+    # Ensure that the samples were fetched in the expected number of requests
+    # Equivalent to math.ceil(limit / batch_size):
+    assert len(called_paths) == (limit - 1) // batch_size + 1
+
+    if limit % batch_size == 0:
+        # If limit is divisible by batch_size, then we expect all calls to have a limit of batch_size
+        assert all(path.endswith(f"?limit={batch_size}") for path in called_paths)
+    else:
+        # Otherwise, expect all calls except for the last one to have a limit of batch_size,
+        # while the last one has a limit of limit % batch_size
+        assert all(path.endswith(f"?limit={batch_size}") for path in called_paths[:-1])
+        assert called_paths[-1].endswith(f"?limit={limit % batch_size}")