From 1d83594a2b821d0fdbdb2203d7446e7dc51edd6d Mon Sep 17 00:00:00 2001
From: Yolanda Robla <info@ysoft.biz>
Date: Tue, 4 Feb 2025 10:36:57 +0100
Subject: [PATCH] feat: group common messages in the same thread

When a PartialQuestions entry has no subsets, look for other entries
whose timestamp differs from it by at most 5 seconds and that share at
least one common message in their message lists. Those entries are
grouped together in the same thread.

Closes: #694
---
 src/codegate/api/v1_processing.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/codegate/api/v1_processing.py b/src/codegate/api/v1_processing.py
index 68653ab5..9d9f7348 100644
--- a/src/codegate/api/v1_processing.py
+++ b/src/codegate/api/v1_processing.py
@@ -235,9 +235,9 @@ def _clean_secrets_from_message(message: str) -> str:
     return pattern.sub("REDACTED_SECRET", message)
 
 
-def _group_partial_messages(
+def _group_partial_messages(  # noqa: C901
     pq_list: List[PartialQuestions],
-) -> List[List[PartialQuestions]]:  # noqa: C901
+) -> List[List[PartialQuestions]]:
     """
     A PartialQuestion is an object that contains several user messages provided from a
     chat conversation. Example:
@@ -272,9 +272,7 @@ def _group_partial_messages(
         # (If sup's messages == sub's messages, that also counts, because sub ⊆ sup)
         possible_subsets: List[PartialQuestions] = []
         for sub in pq_list_sorted:
-            if sub.message_id == sup.message_id:
-                continue
-            if sub.message_id in used:
+            if sub.message_id == sup.message_id or sub.message_id in used:
                 continue
             if (
                 set(sub.messages).issubset(set(sup.messages))
@@ -283,10 +281,23 @@ def _group_partial_messages(
             ):
                 possible_subsets.append(sub)
 
-        # 3) If there are no subsets, this sup stands alone
+        # 3) If there are no subsets, check for time-based grouping
         if not possible_subsets:
-            groups.append([sup])
+            new_group = [sup]
             used.add(sup.message_id)
+
+            for other in pq_list_sorted:
+                if other.message_id in used or other.message_id == sup.message_id:
+                    continue
+                if abs((other.timestamp - sup.timestamp).total_seconds()) <= 5 and set(
+                    other.messages
+                ) & set(
+                    sup.messages
+                ):  # At least one message in common
+                    new_group.append(other)
+                    used.add(other.message_id)
+
+            groups.append(new_group)
         else:
             # 4) Group subsets by messages to discard duplicates e.g.: 2 subsets with single 'hello'
             subs_group_by_messages = defaultdict(list)