Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions sdk/cosmos/azure-cosmos/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
## Release History

### 4.14.5 (2026-01-15)

#### Bugs Fixed
* Fixed bug where the SDK encountered a timeout caused by infinite recursion while handling a 410 (Gone) error. See [PR 44649](https://github.com/Azure/azure-sdk-for-python/pull/44649)

### 4.14.4 (2026-01-12)

#### Bugs Fixed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@

from collections import deque
import copy
import logging

from ...aio import _retry_utility_async
from ... import http_constants, exceptions

_LOGGER = logging.getLogger(__name__)

# pylint: disable=protected-access

Expand Down Expand Up @@ -145,23 +147,53 @@ async def callback(**kwargs): # pylint: disable=unused-argument
self._client, self._client._global_endpoint_manager, callback, **self._options
)

# Check if this is an internal partition key range fetch - skip 410 retry logic to avoid recursion
# When we call refresh_routing_map_provider(), it triggers _ReadPartitionKeyRanges which would
# come through this same code path. If that also gets a 410 and tries to refresh, we get infinite recursion.
is_pk_range_fetch = self._options.get("_internal_pk_range_fetch", False)
if is_pk_range_fetch:
# For partition key range queries, just execute without 410 partition split retry
# The underlying retry utility will still handle other transient errors
_LOGGER.debug("Partition split retry (async): Skipping 410 retry for internal PK range fetch")
return await execute_fetch()

max_retries = 3
attempt = 0

while attempt <= max_retries:
try:
return await execute_fetch()
except exceptions.CosmosHttpResponseError as e:
if exceptions._partition_range_is_gone(e):
attempt += 1
if attempt > max_retries:
_LOGGER.error(
"Partition split retry (async): Exhausted all %d retries. "
"state: _has_started=%s, _continuation=%s",
max_retries, self._has_started, self._continuation
)
raise # Exhausted retries, propagate error

_LOGGER.warning(
"Partition split retry (async): 410 error (sub_status=%s). Attempt %d of %d. "
"Refreshing routing map and resetting state.",
getattr(e, 'sub_status', 'N/A'),
attempt,
max_retries
)

# Refresh routing map to get new partition key ranges
self._client.refresh_routing_map_provider()
# Reset execution context state to allow retry from the beginning
self._has_started = False
self._continuation = None
# Retry immediately (no backoff needed for partition splits)
continue
raise # Not a partition split error, propagate immediately

# This should never be reached, but added for safety
return []


class _DefaultQueryExecutionContext(_QueryExecutionContextBase):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@

from collections import deque
import copy
import logging
from .. import _retry_utility, http_constants, exceptions

_LOGGER = logging.getLogger(__name__)

# pylint: disable=protected-access

Expand Down Expand Up @@ -143,6 +145,16 @@ def callback(**kwargs): # pylint: disable=unused-argument
self._client, self._client._global_endpoint_manager, callback, **self._options
)

# Check if this is an internal partition key range fetch - skip 410 retry logic to avoid recursion
# When we call refresh_routing_map_provider(), it triggers _ReadPartitionKeyRanges which would
# come through this same code path. If that also gets a 410 and tries to refresh, we get infinite recursion.
is_pk_range_fetch = self._options.get("_internal_pk_range_fetch", False)
if is_pk_range_fetch:
# For partition key range queries, just execute without 410 partition split retry
# The underlying retry utility will still handle other transient errors
_LOGGER.debug("Partition split retry: Skipping 410 retry for internal PK range fetch")
return execute_fetch()

max_retries = 3
attempt = 0

Expand All @@ -153,13 +165,32 @@ def callback(**kwargs): # pylint: disable=unused-argument
if exceptions._partition_range_is_gone(e):
attempt += 1
if attempt > max_retries:
_LOGGER.error(
"Partition split retry: Exhausted all %d retries. "
"state: _has_started=%s, _continuation=%s",
max_retries, self._has_started, self._continuation
)
raise # Exhausted retries, propagate error

_LOGGER.warning(
"Partition split retry: 410 error (sub_status=%s). Attempt %d of %d. "
"Refreshing routing map and resetting state.",
getattr(e, 'sub_status', 'N/A'),
attempt,
max_retries
)

# Refresh routing map to get new partition key ranges
self._client.refresh_routing_map_provider()
# Reset execution context state to allow retry from the beginning
self._has_started = False
self._continuation = None
# Retry immediately (no backoff needed for partition splits)
continue
raise # Not a partition split error, propagate immediately

# This should never be reached, but added for safety
return []
next = __next__ # Python 2 compatibility.


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,15 @@
"""Internal class for partition key range cache implementation in the Azure
Cosmos database service.
"""
import logging
from typing import Any, Optional

from ... import _base
from ..collection_routing_map import CollectionRoutingMap
from .. import routing_range

_LOGGER = logging.getLogger(__name__)

# pylint: disable=protected-access


Expand Down Expand Up @@ -75,18 +78,33 @@ async def init_collection_routing_map_if_needed(
):
collection_routing_map = self._collection_routing_map_by_item.get(collection_id)
if collection_routing_map is None:
# Pass _internal_pk_range_fetch flag to prevent recursive 410 retry logic
# When a 410 partition split error occurs, the SDK calls refresh_routing_map_provider()
# which clears the cache and retries. The retry needs partition key ranges, which calls
# this method, which triggers _ReadPartitionKeyRanges. If that query also goes through
# the 410 retry logic and calls refresh again, we get infinite recursion.
_LOGGER.debug(
"PK range cache (async): Initializing routing map for collection_id=%s with "
"_internal_pk_range_fetch=True to prevent recursive 410 retry.",
collection_id
)
pk_range_kwargs = {**kwargs, "_internal_pk_range_fetch": True}
collection_pk_ranges = [pk async for pk in
self._documentClient._ReadPartitionKeyRanges(collection_link,
feed_options,
**kwargs)]
**pk_range_kwargs)]
# for large collections, a split may complete between the read partition key ranges query page responses,
# causing the partitionKeyRanges to have both the children ranges and their parents. Therefore, we need
# to discard the parent ranges to have a valid routing map.
collection_pk_ranges = PartitionKeyRangeCache._discard_parent_ranges(collection_pk_ranges)
collection_pk_ranges = list(PartitionKeyRangeCache._discard_parent_ranges(collection_pk_ranges))
collection_routing_map = CollectionRoutingMap.CompleteRoutingMap(
[(r, True) for r in collection_pk_ranges], collection_id
)
self._collection_routing_map_by_item[collection_id] = collection_routing_map
_LOGGER.debug(
"PK range cache (async): Cached routing map for collection_id=%s with %d ranges",
collection_id, len(collection_pk_ranges)
)

async def get_range_by_partition_key_range_id(
self,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,16 @@
"""Internal class for partition key range cache implementation in the Azure
Cosmos database service.
"""
import logging
from typing import Any, Optional

from .. import _base
from .collection_routing_map import CollectionRoutingMap
from . import routing_range
from .routing_range import PartitionKeyRange

_LOGGER = logging.getLogger(__name__)


# pylint: disable=protected-access

Expand Down Expand Up @@ -61,17 +64,32 @@ def init_collection_routing_map_if_needed(
):
collection_routing_map = self._collection_routing_map_by_item.get(collection_id)
if not collection_routing_map:
# Pass _internal_pk_range_fetch flag to prevent recursive 410 retry logic
# When a 410 partition split error occurs, the SDK calls refresh_routing_map_provider()
# which clears the cache and retries. The retry needs partition key ranges, which calls
# this method, which triggers _ReadPartitionKeyRanges. If that query also goes through
# the 410 retry logic and calls refresh again, we get infinite recursion.
_LOGGER.debug(
"PK range cache: Initializing routing map for collection_id=%s with "
"_internal_pk_range_fetch=True to prevent recursive 410 retry.",
collection_id
)
pk_range_kwargs = {**kwargs, "_internal_pk_range_fetch": True}
collection_pk_ranges = list(self._documentClient._ReadPartitionKeyRanges(collection_link,
feed_options,
**kwargs))
**pk_range_kwargs))
# for large collections, a split may complete between the read partition key ranges query page responses,
# causing the partitionKeyRanges to have both the children ranges and their parents. Therefore, we need
# to discard the parent ranges to have a valid routing map.
collection_pk_ranges = PartitionKeyRangeCache._discard_parent_ranges(collection_pk_ranges)
collection_pk_ranges = list(PartitionKeyRangeCache._discard_parent_ranges(collection_pk_ranges))
collection_routing_map = CollectionRoutingMap.CompleteRoutingMap(
[(r, True) for r in collection_pk_ranges], collection_id
)
self._collection_routing_map_by_item[collection_id] = collection_routing_map
_LOGGER.debug(
"PK range cache: Cached routing map for collection_id=%s with %d ranges",
collection_id, len(collection_pk_ranges)
)

def get_overlapping_ranges(self, collection_link, partition_key_ranges, feed_options, **kwargs):
"""Given a partition key range and a collection, return the list of
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ def _Request(global_endpoint_manager, request_params, connection_policy, pipelin
"""
# pylint: disable=protected-access, too-many-branches
kwargs.pop(_Constants.OperationStartTime, None)
# Pop internal flags that should not be passed to the HTTP layer
kwargs.pop("_internal_pk_range_fetch", None)
connection_timeout = connection_policy.RequestTimeout
connection_timeout = kwargs.pop("connection_timeout", connection_timeout)
read_timeout = connection_policy.ReadTimeout
Expand Down
2 changes: 1 addition & 1 deletion sdk/cosmos/azure-cosmos/azure/cosmos/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

VERSION = "4.14.4"
VERSION = "4.14.5"
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p
"""
# pylint: disable=protected-access, too-many-branches
kwargs.pop(_Constants.OperationStartTime, None)
# Pop internal flags that should not be passed to the HTTP layer
kwargs.pop("_internal_pk_range_fetch", None)
connection_timeout = connection_policy.RequestTimeout
read_timeout = connection_policy.ReadTimeout
connection_timeout = kwargs.pop("connection_timeout", connection_timeout)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class MockedCosmosClientConnection(object):
def __init__(self, partition_key_ranges):
self.partition_key_ranges = partition_key_ranges

def _ReadPartitionKeyRanges(self, collection_link: str, feed_options: Optional[Mapping[str, Any]] = None):
def _ReadPartitionKeyRanges(self, collection_link: str, feed_options: Optional[Mapping[str, Any]] = None, **kwargs):
return self.partition_key_ranges

def setUp(self):
Expand Down
Loading
Loading