Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions docs/job_definition_parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,46 @@ datapoints=["image1.jpg", "image2.jpg"],
contexts=["A cat sitting on a red couch", "A blue car in the rain"]
```

**Length limit:** A context may be at most 400 characters; the backend rejects longer ones. If a context exceeds the limit, a warning is logged at creation time. See `auto_shorten` below to have over-long contexts shortened automatically.

---

### `auto_shorten`

| Property | Value |
|----------|-------|
| **Type** | `bool` |
| **Required** | No |
| **Default** | `False` |

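When `True`, any context longer than the 400-character limit is automatically shortened before upload — tuned to the `instruction`, so only the part relevant to the question is kept. If no instruction is available to shorten against, or the shortening service returns an empty result, the original context is kept and a warning is logged. When `False` (the default), an over-long context is left unchanged and a warning is logged explaining that the backend would reject it.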

```python
order = rapi.order.create_classification_order(
name="Outfit check",
instruction="Does the main character wear the right clothing?",
answer_options=["Yes", "No"],
datapoints=["scene.jpg"],
contexts=["<a very long, detailed beach-scene description ...>"],
auto_shorten=True,
)
```

You can also shorten contexts directly via the client, without creating an order:

```python
short = rapi.context.shorten_context(
context="<a very long description ...>",
question="Does the main character wear the right clothing?",
)

# Or a batch of (context, question) pairs in one call:
shortened = rapi.context.shorten_contexts([
(context_a, question_a),
(context_b, question_b),
])
```

---

### `media_contexts`
Expand Down
1 change: 1 addition & 0 deletions src/rapidata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
DeviceFilter,
DeviceType,
Datapoint,
ContextManager,
FailedUploadException,
FailedUpload,
rapidata_config,
Expand Down
3 changes: 3 additions & 0 deletions src/rapidata/rapidata_client/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@
EffortSelection,
)
from .datapoints import Datapoint
from .context import ContextManager
from .datapoints.metadata import (
PrivateTextMetadata,
PublicTextMetadata,
SelectWordsMetadata,
)

# --- GENERATED SETTINGS IMPORTS START ---
from .settings import (
RapidataSettings,
Expand All @@ -48,6 +50,7 @@
CompareEquirectangularSetting,
ClassifyEquirectangularSetting,
)

# --- GENERATED SETTINGS IMPORTS END ---
from .filter import (
CountryFilter,
Expand Down
4 changes: 4 additions & 0 deletions src/rapidata/rapidata_client/context/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .context_manager import ContextManager
from ._context_length import MAX_CONTEXT_LENGTH

# Public names re-exported by this package.
__all__ = ["ContextManager", "MAX_CONTEXT_LENGTH"]
82 changes: 82 additions & 0 deletions src/rapidata/rapidata_client/context/_context_length.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from rapidata.rapidata_client.config import logger

if TYPE_CHECKING:
from rapidata.rapidata_client.datapoints._datapoint import Datapoint
from rapidata.rapidata_client.context.context_manager import ContextManager

# Maximum length, in characters, that a datapoint context may have.
# Mirrors the backend's datapoint/group context validation
# (datasets-service CreateDatapointCommandValidator: `RuleFor(x => x.Context).MaximumLength(400)`).
# Keep in sync if the backend limit changes.
MAX_CONTEXT_LENGTH = 400


def enforce_context_length(
    datapoints: list[Datapoint],
    question: str | None,
    auto_shorten: bool,
    context_manager: ContextManager,
) -> None:
    """Check datapoint contexts against the backend's maximum length, in place.

    For every datapoint whose context exceeds :data:`MAX_CONTEXT_LENGTH`:

    - if ``auto_shorten`` is True and a ``question`` is available, the context
      is shortened for that question (one batched request) and substituted;
    - otherwise a warning is logged explaining the backend would reject it.

    Args:
        datapoints: The datapoints to check. Over-long contexts may be
            replaced on these objects directly.
        question: The question/instruction the contexts are shortened
            against. ``None`` or an empty string disables shortening.
        auto_shorten: Whether over-long contexts should be shortened
            automatically instead of only being warned about.
        context_manager: Used to issue the batched shorten request.

    Returns:
        None. Datapoints are modified in place; problems are reported through
        log warnings rather than exceptions.
    """
    over_limit = [
        (index, datapoint)
        for index, datapoint in enumerate(datapoints)
        if datapoint.context is not None and len(datapoint.context) > MAX_CONTEXT_LENGTH
    ]
    if not over_limit:
        return

    if auto_shorten and question:
        # One original per over-limit entry, so results can be matched back by
        # position. ``or ""`` only narrows the type: every entry in
        # ``over_limit`` has a non-empty context by construction.
        originals = [datapoint.context or "" for _, datapoint in over_limit]
        shortened = context_manager.shorten_contexts(
            [(original, question) for original in originals]
        )

        # A result count that differs from the request count means positions
        # can no longer be trusted; substituting anyway could attach a context
        # to the wrong datapoint, and zip() would silently drop the remainder.
        if len(shortened) != len(over_limit):
            logger.warning(
                "shorten-context returned %d result(s) for %d context(s); leaving "
                "all %d over-long context(s) unchanged. They exceed the maximum of "
                "%d characters and would be rejected by the backend.",
                len(shortened),
                len(over_limit),
                len(over_limit),
                MAX_CONTEXT_LENGTH,
            )
            return

        for (index, datapoint), original, new_context in zip(
            over_limit, originals, shortened
        ):
            if not new_context:
                logger.warning(
                    "Datapoint %d: shorten-context returned an empty result; "
                    "keeping the original context.",
                    index,
                )
                continue
            logger.info(
                "Datapoint %d: shortened context from %d to %d characters.",
                index,
                len(original),
                len(new_context),
            )
            if len(new_context) > MAX_CONTEXT_LENGTH:
                # Shortening is best-effort; surface the case where the result
                # would still be rejected instead of failing later at upload.
                logger.warning(
                    "Datapoint %d: shortened context is still %d characters, which "
                    "exceeds the maximum of %d and would be rejected by the backend.",
                    index,
                    len(new_context),
                    MAX_CONTEXT_LENGTH,
                )
            datapoint.context = new_context
        return

    if auto_shorten:
        # auto_shorten needs the question to tune the context; without it we
        # can't shorten, so fall back to warning rather than silently proceed.
        logger.warning(
            "auto_shorten=True but no question/instruction was available to shorten "
            "the context against; leaving %d over-long context(s) unchanged.",
            len(over_limit),
        )

    # Advising "pass auto_shorten=True" is misleading when it is already set,
    # so the hint depends on why shortening did not happen.
    hint = (
        "Shorten it, or provide a question/instruction so auto_shorten can "
        "shorten it automatically."
        if auto_shorten
        else "Shorten it, or pass auto_shorten=True to shorten it automatically."
    )
    for index, datapoint in over_limit:
        context = datapoint.context
        if context is None:
            continue
        logger.warning(
            "Datapoint %d has a context of %d characters, which exceeds the maximum "
            "of %d and would be rejected by the backend. %s",
            index,
            len(context),
            MAX_CONTEXT_LENGTH,
            hint,
        )
47 changes: 47 additions & 0 deletions src/rapidata/rapidata_client/context/context_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from __future__ import annotations

from typing import Sequence, TYPE_CHECKING

from rapidata.rapidata_client.config import logger, tracer

if TYPE_CHECKING:
from rapidata.service.openapi_service import OpenAPIService


class ContextManager:
    """Shortens a datapoint's context for the specific question an annotator answers.

    A long, general context (e.g. a full scene description) is often far more
    detail than a single question needs. This manager tunes a context down to
    what is relevant for the question, which keeps it within the length the
    backend accepts and focuses the annotator. Results are cached server-side.
    """

    def __init__(self, openapi_service: OpenAPIService):
        """Store the API service used to issue shorten requests.

        Args:
            openapi_service: The service whose ``context`` client performs
                the actual shorten-context calls.
        """
        self._openapi_service = openapi_service
        logger.debug("ContextManager initialized")

    def shorten_context(self, context: str, question: str) -> str:
        """Shorten a single context for the given question.

        Args:
            context: The (potentially long) context to shorten.
            question: The question the context will be shown alongside. The
                context is tuned to what this question needs.

        Returns:
            The shortened context.
        """
        # Delegates to the batch call with a one-element batch.
        return self.shorten_contexts([(context, question)])[0]

    def shorten_contexts(self, pairs: Sequence[tuple[str, str]]) -> list[str]:
        """Shorten a batch of ``(context, question)`` pairs in one request.

        Args:
            pairs: The ``(context, question)`` pairs to shorten.

        Returns:
            The shortened contexts, in the same order as ``pairs``. An empty
            ``pairs`` yields an empty list without contacting the backend.
        """
        # Nothing to shorten: skip the network round trip entirely.
        if not pairs:
            return []
        with tracer.start_as_current_span("ContextManager.shorten_contexts"):
            return self._openapi_service.context.shorten_contexts(pairs)
Loading
Loading