Skip to content

Commit dfdfb91

Browse files
committed
\
Fix standalone year-only copyright detection without holder

Signed-off-by: dikshaa2909 <dikshadeware@gmail.com>
1 parent 022ddc8 commit dfdfb91

1 file changed

Lines changed: 53 additions & 32 deletions

File tree

src/cluecode/copyrights.py

Lines changed: 53 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -326,8 +326,9 @@ def detect(self,
326326
'IS', 'HELD',
327327
])
328328

329-
# then walk the parse parse_tree, collecting copyrights, years and authors
329+
# walk parse tree
330330
for tree_node in parse_tree:
331+
331332
if not isinstance(tree_node, Tree):
332333
if TRACE:
333334
logger_debug(f'CopyrightDetector: parse_tree node: {tree_node}')
@@ -336,6 +337,7 @@ def detect(self,
336337
tree_node_label = tree_node.label
337338

338339
if (include_copyrights or include_holders) and 'COPYRIGHT' in tree_node_label:
340+
339341
copyrght = build_detection_from_node(
340342
node=tree_node,
341343
cls=CopyrightDetection,
@@ -344,40 +346,41 @@ def detect(self,
344346
refiner=refine_copyright,
345347
)
346348

347-
if TRACE or TRACE_DEEP:
348-
logger_debug(f'CopyrightDetector: final copyright: {copyrght}')
349+
if not copyrght:
350+
continue
349351

350-
if copyrght:
351-
if include_copyrights:
352-
yield copyrght
352+
holder = None
353+
if include_holders:
354+
holder = build_detection_from_node(
355+
node=tree_node,
356+
cls=HolderDetection,
357+
ignored_labels=non_holder_labels,
358+
refiner=refine_holder,
359+
)
353360

354-
if include_holders:
355-
# By default we strip email and urls from holders ....
361+
if not holder:
356362
holder = build_detection_from_node(
357363
node=tree_node,
358364
cls=HolderDetection,
359-
ignored_labels=non_holder_labels,
365+
ignored_labels=non_holder_labels_mini,
360366
refiner=refine_holder,
361367
)
362368

363-
if not holder:
364-
# ... but if we have no holder, we try again and
365-
# this time we keep email and URLs for holders using
366-
# "non_holder_labels_mini" as an "ignores" label set
367-
holder = build_detection_from_node(
368-
node=tree_node,
369-
cls=HolderDetection,
370-
ignored_labels=non_holder_labels_mini,
371-
refiner=refine_holder,
372-
)
369+
if (
370+
copyrght.copyright
371+
and re.match(r'^Copyright\s+\d{4}$', copyrght.copyright.strip())
372+
and not holder
373+
):
374+
continue
373375

374-
if holder:
375-
if TRACE:
376-
logger_debug(f'CopyrightDetector: holders: {holder}')
376+
if include_copyrights:
377+
yield copyrght
377378

378-
yield holder
379+
if include_holders and holder:
380+
yield holder
379381

380382
elif include_authors and tree_node_label == 'AUTHOR':
383+
381384
author = build_detection_from_node(
382385
node=tree_node,
383386
cls=AuthorDetection,
@@ -388,10 +391,9 @@ def detect(self,
388391
if author:
389392
if TRACE:
390393
logger_debug(f'CopyrightDetector: detected authors: {author}')
391-
392394
yield author
393395

394-
396+
395397
def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
396398
"""
397399
Return an iterable of pygmars.Token built from a ``numbered_lines`` iterable
@@ -3559,6 +3561,7 @@ def refine_copyright(c):
35593561
c = strip_suffixes(c, suffixes=COPYRIGHTS_SUFFIXES)
35603562
c = strip_trailing_period(c)
35613563
c = c.strip("'")
3564+
c = re.sub(r'\b(\d{4})-\$', r'\1', c)
35623565
return c.strip()
35633566

35643567

@@ -3614,14 +3617,9 @@ def refine_holder(h):
36143617

36153618

36163619
def refine_author(a):
3617-
"""
3618-
Refine a detected author.
3619-
FIXME: the grammar should not allow this to happen.
3620-
"""
36213620
if not a:
36223621
return
3623-
# FIXME: we could consider to split comma separated lists such as
3624-
# gthomas, sorin@netappi.com, andrew.lunn@ascom.che.g.
3622+
36253623
a = remove_some_extra_words_and_punct(a)
36263624
a = refine_names(a, prefixes=AUTHORS_PREFIXES)
36273625
a = a.strip()
@@ -3633,7 +3631,30 @@ def refine_author(a):
36333631
a = refine_names(a, prefixes=AUTHORS_PREFIXES)
36343632
a = a.strip()
36353633
a = a.strip('+-')
3636-
if a and a.lower() not in AUTHORS_JUNK and not a.startswith(AUTHORS_JUNK_PREFIX):
3634+
3635+
if not a:
3636+
return
3637+
3638+
al = a.lower()
3639+
3640+
# Reject obvious URLs
3641+
if (
3642+
al.startswith("http")
3643+
or "github.com" in al
3644+
or "www." in al
3645+
or "://" in al
3646+
):
3647+
return
3648+
3649+
# Reject lines without letters
3650+
if not any(c.isalpha() for c in a):
3651+
return
3652+
3653+
# Reject very long non-name strings
3654+
if len(a.split()) > 8:
3655+
return
3656+
3657+
if al not in AUTHORS_JUNK and not a.startswith(AUTHORS_JUNK_PREFIX):
36373658
return a
36383659

36393660

0 commit comments

Comments
 (0)