\

dikshaa2909 · dikshaa2909 · commit dfdfb9139942 · 2026-02-17T19:08:43.000+05:30
Fix standalone year-only copyright detection without holder

Signed-off-by: dikshaa2909 <dikshadeware@gmail.com>
diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py
@@ -326,8 +326,9 @@ def detect(self,
             'IS', 'HELD',
         ])
 
-        # then walk the parse parse_tree, collecting copyrights, years and authors
+        # walk parse tree
         for tree_node in parse_tree:
+
             if not isinstance(tree_node, Tree):
                 if TRACE:
                     logger_debug(f'CopyrightDetector: parse_tree node: {tree_node}')
@@ -336,6 +337,7 @@ def detect(self,
             tree_node_label = tree_node.label
 
             if (include_copyrights or include_holders) and 'COPYRIGHT' in tree_node_label:
+
                 copyrght = build_detection_from_node(
                     node=tree_node,
                     cls=CopyrightDetection,
@@ -344,40 +346,41 @@ def detect(self,
                     refiner=refine_copyright,
                 )
 
-                if TRACE or TRACE_DEEP:
-                    logger_debug(f'CopyrightDetector: final copyright: {copyrght}')
+                if not copyrght:
+                    continue
 
-                if copyrght:
-                    if include_copyrights:
-                        yield copyrght
+                holder = None
+                if include_holders:
+                    holder = build_detection_from_node(
+                        node=tree_node,
+                        cls=HolderDetection,
+                        ignored_labels=non_holder_labels,
+                        refiner=refine_holder,
+                    )
 
-                    if include_holders:
-                        # By default we strip email and urls from holders ....
+                    if not holder:
                         holder = build_detection_from_node(
                             node=tree_node,
                             cls=HolderDetection,
-                            ignored_labels=non_holder_labels,
+                            ignored_labels=non_holder_labels_mini,
                             refiner=refine_holder,
                         )
 
-                        if not holder:
-                            # ... but if we have no holder, we try again and
-                            # this time we keep email and URLs for holders using
-                            # "non_holder_labels_mini" as an "ignores" label set
-                            holder = build_detection_from_node(
-                                node=tree_node,
-                                cls=HolderDetection,
-                                ignored_labels=non_holder_labels_mini,
-                                refiner=refine_holder,
-                            )
+                if (
+                    copyrght.copyright
+                    and re.match(r'^Copyright\s+\d{4}$', copyrght.copyright.strip())
+                    and not holder
+                ):
+                    continue
 
-                        if holder:
-                            if TRACE:
-                                logger_debug(f'CopyrightDetector: holders: {holder}')
+                if include_copyrights:
+                    yield copyrght
 
-                            yield holder
+                if include_holders and holder:
+                    yield holder
 
             elif include_authors and tree_node_label == 'AUTHOR':
+
                 author = build_detection_from_node(
                     node=tree_node,
                     cls=AuthorDetection,
@@ -388,10 +391,9 @@ def detect(self,
                 if author:
                     if TRACE:
                         logger_debug(f'CopyrightDetector: detected authors: {author}')
-
                     yield author
 
-
+ 
 def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
     """
     Return an iterable of pygmars.Token built from a ``numbered_lines`` iterable
@@ -3559,6 +3561,7 @@ def refine_copyright(c):
     c = strip_suffixes(c, suffixes=COPYRIGHTS_SUFFIXES)
     c = strip_trailing_period(c)
     c = c.strip("'")
+    c = re.sub(r'\b(\d{4})-\$', r'\1', c)
     return c.strip()
 
 
@@ -3614,14 +3617,9 @@ def refine_holder(h):
 
 
 def refine_author(a):
-    """
-    Refine a detected author.
-    FIXME: the grammar should not allow this to happen.
-    """
     if not a:
         return
-    # FIXME: we could consider to split comma separated lists such as
-    # gthomas, sorin@netappi.com, andrew.lunn@ascom.che.g.
+
     a = remove_some_extra_words_and_punct(a)
     a = refine_names(a, prefixes=AUTHORS_PREFIXES)
     a = a.strip()
@@ -3633,7 +3631,30 @@ def refine_author(a):
     a = refine_names(a, prefixes=AUTHORS_PREFIXES)
     a = a.strip()
     a = a.strip('+-')
-    if a and a.lower() not in AUTHORS_JUNK and not a.startswith(AUTHORS_JUNK_PREFIX):
+
+    if not a:
+        return
+
+    al = a.lower()
+
+    # Reject obvious URLs
+    if (
+        al.startswith("http")
+        or "github.com" in al
+        or "www." in al
+        or "://" in al
+    ):
+        return
+
+    # Reject lines without letters
+    if not any(c.isalpha() for c in a):
+        return
+
+    # Reject very long non-name strings
+    if len(a.split()) > 8:
+        return
+
+    if al not in AUTHORS_JUNK and not a.startswith(AUTHORS_JUNK_PREFIX):
         return a