From 99d954b076193e3a71d68b09910d3ce9ca88ab7f Mon Sep 17 00:00:00 2001 From: nedeadinside Date: Thu, 16 Apr 2026 17:03:36 +0700 Subject: [PATCH 1/2] fix: guard model.half() with dtype check in all rerankers Calling .half() on an already-FP16 model raises an error, causing rerankers with use_fp16=True to crash on every request after the first. Add a dtype guard so the conversion only runs when needed: if self.use_fp16 and next(self.model.parameters()).dtype != torch.float16: self.model.half() Fixes BaseReranker, BaseLLMReranker, LightweightLLMReranker, LayerWiseLLMReranker, and MatroyshkaReranker. Co-Authored-By: Claude Sonnet 4.6 --- FlagEmbedding/inference/reranker/decoder_only/base.py | 3 ++- FlagEmbedding/inference/reranker/decoder_only/layerwise.py | 3 ++- FlagEmbedding/inference/reranker/decoder_only/lightweight.py | 3 ++- FlagEmbedding/inference/reranker/encoder_only/base.py | 3 ++- research/Matroyshka_reranker/inference/rank_model.py | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/FlagEmbedding/inference/reranker/decoder_only/base.py b/FlagEmbedding/inference/reranker/decoder_only/base.py index 4d5b26ec6..7fcbb0fab 100644 --- a/FlagEmbedding/inference/reranker/decoder_only/base.py +++ b/FlagEmbedding/inference/reranker/decoder_only/base.py @@ -297,7 +297,8 @@ def compute_score_single_gpu( device = self.target_devices[0] if device == "cpu": self.use_fp16 = False - if self.use_fp16: self.model.half() + if self.use_fp16 and next(self.model.parameters()).dtype != torch.float16: + self.model.half() self.model.to(device) self.model.eval() diff --git a/FlagEmbedding/inference/reranker/decoder_only/layerwise.py b/FlagEmbedding/inference/reranker/decoder_only/layerwise.py index 4b75da36a..410c55acb 100644 --- a/FlagEmbedding/inference/reranker/decoder_only/layerwise.py +++ b/FlagEmbedding/inference/reranker/decoder_only/layerwise.py @@ -179,7 +179,8 @@ def compute_score_single_gpu( device = self.target_devices[0] if device == "cpu": self.use_fp16 = False - if self.use_fp16: self.model.half() + if self.use_fp16 and next(self.model.parameters()).dtype != torch.float16: + self.model.half() self.model.to(device) self.model.eval() diff --git a/FlagEmbedding/inference/reranker/decoder_only/lightweight.py b/FlagEmbedding/inference/reranker/decoder_only/lightweight.py index 000478afb..297d57046 100644 --- a/FlagEmbedding/inference/reranker/decoder_only/lightweight.py +++ b/FlagEmbedding/inference/reranker/decoder_only/lightweight.py @@ -258,7 +258,8 @@ def compute_score_single_gpu( device = self.target_devices[0] if device == "cpu": self.use_fp16 = False - if self.use_fp16: self.model.half() + if self.use_fp16 and next(self.model.parameters()).dtype != torch.float16: + self.model.half() self.model.to(device) self.model.eval() diff --git a/FlagEmbedding/inference/reranker/encoder_only/base.py b/FlagEmbedding/inference/reranker/encoder_only/base.py index 1a4d8b6a4..51ada7550 100644 --- a/FlagEmbedding/inference/reranker/encoder_only/base.py +++ b/FlagEmbedding/inference/reranker/encoder_only/base.py @@ -111,7 +111,8 @@ def compute_score_single_gpu( device = self.target_devices[0] if device == "cpu": self.use_fp16 = False - if self.use_fp16: self.model.half() + if self.use_fp16 and next(self.model.parameters()).dtype != torch.float16: + self.model.half() self.model.to(device) self.model.eval() diff --git a/research/Matroyshka_reranker/inference/rank_model.py b/research/Matroyshka_reranker/inference/rank_model.py index bbdca4ce9..2cd1c0636 100644 --- a/research/Matroyshka_reranker/inference/rank_model.py +++ b/research/Matroyshka_reranker/inference/rank_model.py @@ -203,7 +203,8 @@ def compute_score_single_gpu( device = self.target_devices[0] if device == "cpu": self.use_fp16 = False - if self.use_fp16: self.model.half() + if self.use_fp16 and next(self.model.parameters()).dtype != torch.float16: + self.model.half() self.model.to(device) self.model.eval() From afe28adafda137ab4a8309de9fc92859ebdaff55 Mon Sep 17 00:00:00 2001 From: Matthew Bondarev <110976029+nedeadinside@users.noreply.github.com> Date: Thu, 16 Apr 2026 17:34:50 +0700 Subject: [PATCH 2/2] refactor: replace self.use_fp16 mutation with local fp16 flag Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- FlagEmbedding/inference/reranker/encoder_only/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FlagEmbedding/inference/reranker/encoder_only/base.py b/FlagEmbedding/inference/reranker/encoder_only/base.py index 51ada7550..87e42b9ec 100644 --- a/FlagEmbedding/inference/reranker/encoder_only/base.py +++ b/FlagEmbedding/inference/reranker/encoder_only/base.py @@ -110,8 +110,8 @@ def compute_score_single_gpu( if device is None: device = self.target_devices[0] - if device == "cpu": self.use_fp16 = False - if self.use_fp16 and next(self.model.parameters()).dtype != torch.float16: + use_fp16 = self.use_fp16 and device != "cpu" + if use_fp16 and next(self.model.parameters()).dtype != torch.float16: self.model.half() self.model.to(device)