Skip to content

Commit e86eb9b

Browse files
videos: improve post migration script
1 parent a845665 commit e86eb9b

1 file changed

Lines changed: 93 additions & 60 deletions

File tree

cds_migrator_kit/videos/scripts/run_failed_tasks.py

Lines changed: 93 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,33 @@
1616
TranscodeVideoTask,
1717
sync_records_with_deposit_files,
1818
)
19+
from cds.modules.records.utils import parse_video_chapters
1920
from invenio_db import db
2021
from invenio_files_rest.models import ObjectVersion, ObjectVersionTag
2122

# Paths of the run's log files; initialized by main() before any logging happens.
SUCCESS_LOG_PATH = None
ERROR_LOG_PATH = None


def _append_log(path, message, missing_error):
    """Append a timestamped *message* to the log file at *path* and echo it.

    :param path: log file path (module global set by ``main``); falsy means
        the run was not initialized yet.
    :param message: text to record.
    :param missing_error: error text raised when *path* is not initialized.
    :raises RuntimeError: if *path* is falsy.
    """
    if not path:
        raise RuntimeError(missing_error)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Open in append mode on every call so concurrent/partial runs never truncate.
    with open(path, "a") as log_file:
        log_file.write(f"[{timestamp}] {message}\n")
    print(message)


def log_success(message):
    """Write a success message to the success log file."""
    _append_log(SUCCESS_LOG_PATH, message, "SUCCESS_LOG_PATH is not initialized.")


def log_error(message):
    """Write an error message to the error log file."""
    _append_log(ERROR_LOG_PATH, message, "ERROR_LOG_PATH is not initialized.")
2246

2347
def copy_master_tags_between_buckets(src_bucket, dst_bucket):
2448
"""Copy tags of the master ObjectVersion from src_bucket to dst_bucket."""
@@ -72,20 +96,22 @@ def run_failed_tasks(failed_tasks, flow, deposit_id, record_id):
7296
task.status = FlowTaskStatus.PENDING
7397
db.session.commit()
7498

75-
print(f"Re-running ExtractMetadataTask for record {record_id}")
99+
log_success(f"Re-running ExtractMetadataTask for record {record_id}")
76100
payload["task_id"] = str(task.id)
77101

78102
celery_task = ExtractMetadataTask()
79103
celery_task.clean(deposit_id=deposit_id, version_id=payload["version_id"])
80104
celery_task.s(**payload).apply_async()
81105
db.session.commit()
82-
fetch_tasks_status(flow_id, ExtractMetadataTask.name, timeout_seconds=120)
106+
fetch_tasks_status(flow_id, ExtractMetadataTask.name, timeout_seconds=300)
83107

84108
# Remove from failed list so we don't run it twice
85109
failed_tasks = [t for t in failed_tasks if t[0] != ExtractMetadataTask.name]
86110

87111
db.session.expire_all()
88112
try:
113+
# Make sure it finished
114+
fetch_tasks_status(flow_id, ExtractMetadataTask.name, timeout_seconds=100)
89115
flow = db.session.query(FlowMetadata).get(flow_id)
90116
deposit = deposit_video_resolver(deposit_id)
91117
extracted_metadata = deposit["_cds"]["extracted_metadata"]
@@ -98,7 +124,7 @@ def run_failed_tasks(failed_tasks, flow, deposit_id, record_id):
98124
dst_bucket=record["_buckets"]["record"],
99125
)
100126
except Exception as e:
101-
print(f"An error occurred while handling ExtractMetadataTask: {e}")
127+
log_error(f"ERROR: ExtractMetadataTask: {e}")
102128

103129
# --- Handle ExtractFramesTask separately ---
104130
if ExtractFramesTask.name in task_names:
@@ -108,7 +134,7 @@ def run_failed_tasks(failed_tasks, flow, deposit_id, record_id):
108134
task.status = FlowTaskStatus.PENDING
109135
db.session.commit()
110136

111-
print(f"Re-running ExtractFramesTask for record {record_id}")
137+
log_success(f"Re-running ExtractFramesTask for record {record_id}")
112138
payload["task_id"] = str(task.id)
113139

114140
celery_task = ExtractFramesTask()
@@ -124,11 +150,11 @@ def run_failed_tasks(failed_tasks, flow, deposit_id, record_id):
124150

125151
# --- Handle if other task failed ---
126152
for task_name, task_id in failed_tasks:
127-
print(f"Re-running failed task: {task_name} for record {record_id}")
153+
log_success(f"Re-running failed task: {task_name} for record {record_id}")
128154

129155
task_cls = _find_celery_task_by_name(task_name)
130156
if not task_cls:
131-
print(f"No Celery task class found for {task_name}. Skipping.")
157+
log_success(f"No Celery task class found for {task_name}. Skipping.")
132158
continue
133159

134160
task = db.session.query(FlowTaskMetadata).get(task_id)
@@ -142,8 +168,6 @@ def run_failed_tasks(failed_tasks, flow, deposit_id, record_id):
142168
celery_task.s(**payload).apply_async()
143169
db.session.commit()
144170

145-
fetch_tasks_status(flow_id, task.name, timeout_seconds=120)
146-
147171

148172
def fetch_tasks_status(flow_id, task_name, timeout_seconds=60):
149173
"""Wait for a specific task in a flow to finish (SUCCESS or FAILURE)."""
@@ -152,7 +176,7 @@ def fetch_tasks_status(flow_id, task_name, timeout_seconds=60):
152176
while True:
153177
elapsed_time = time.time() - start_time
154178
if elapsed_time >= timeout_seconds:
155-
print(f"Timeout reached after {timeout_seconds} seconds. Exiting.")
179+
log_error(f"Timeout reached after {timeout_seconds} seconds. Exiting.")
156180
break
157181

158182
# Force SQLAlchemy to fetch fresh data from the DB
@@ -173,20 +197,35 @@ def fetch_tasks_status(flow_id, task_name, timeout_seconds=60):
173197
else:
174198
# Explicit success/failure logs
175199
if task.status == FlowTaskStatus.SUCCESS:
176-
print(f" Task '{task.name}' completed successfully.")
200+
log_success(f"SUCCESS Task '{task.name}' completed successfully.")
177201
elif task.status == FlowTaskStatus.FAILURE:
178-
print(f" Task '{task.name}' failed.")
202+
log_error(f"ERROR: Task '{task.name}' failed.")
179203
else:
180-
print(f"ℹTask '{task.name}' finished with status: {task.status.name}.")
204+
log_success(
205+
f"Task '{task.name}' finished with status: {task.status.name}."
206+
)
181207
break
182208

183209
time.sleep(5) # Poll every 5 seconds
184210

185211

186-
def finalize_tasks(deposit_id):
212+
def run_chapters_task(deposit_id, record_id, flow_id):
213+
"""Run the chapters task for a given deposit and flow."""
214+
# Make sure all ExtractMetadataTask and ExtractFramesTask finished
215+
fetch_tasks_status(flow_id, ExtractMetadataTask.name, timeout_seconds=60)
216+
fetch_tasks_status(flow_id, ExtractFramesTask.name, timeout_seconds=60)
217+
187218
# Always work on a clean session to avoid cached data
188219
db.session.expire_all()
189220

221+
# Get the record to see if it has chapters
222+
record = record_video_resolver(record_id)
223+
description = record.get("description", "")
224+
chapters = parse_video_chapters(description)
225+
if not chapters:
226+
log_success(f"No chapters found for record {record_id}. Skipping chapters.")
227+
return
228+
190229
flow = FlowMetadata.get_by_deposit(deposit_id)
191230
flow_id = flow.id
192231
payload = flow.payload.copy()
@@ -199,12 +238,16 @@ def finalize_tasks(deposit_id):
199238
and task.status == FlowTaskStatus.SUCCESS
200239
):
201240
run_chapters_task = False
241+
if (
242+
task.name == ExtractMetadataTask.name or task.name == ExtractFramesTask.name
243+
) and task.status == FlowTaskStatus.FAILURE:
244+
run_chapters_task = False
245+
log_error(
246+
f"ERROR: ExtractMetadataTask/ExtractFramesTask failed for deposit {deposit_id}. Chapter task will not run."
247+
)
202248

203249
if run_chapters_task:
204-
# Wait for ExtractFramesTask to finish to run ExtractChapterFramesTask
205-
print("Waiting for ExtractFramesTask to complete...")
206-
fetch_tasks_status(flow_id, ExtractFramesTask.name, timeout_seconds=240)
207-
print("Running ExtractChapterFramesTask...")
250+
log_success("Running ExtractChapterFramesTask...")
208251

209252
# Create a FlowTaskMetadata
210253
new_task = FlowTaskMetadata(
@@ -218,72 +261,62 @@ def finalize_tasks(deposit_id):
218261
payload["task_id"] = str(new_task.id)
219262
ExtractChapterFramesTask().s(**payload).apply_async()
220263

221-
# Poll for task completion
222-
fetch_tasks_status(flow_id, ExtractChapterFramesTask.name, timeout_seconds=240)
223-
224-
225-
def fetch_flow_and_log(record_id, deposit_id, flow_id, failed_tasks, log_file_path):
226-
"""Fetch the latest flow and write detailed info to the log file."""
227-
# Ensure we read the latest DB state
228-
db.session.expire_all()
229-
flow = db.session.query(FlowMetadata).get(flow_id)
230-
231-
with open(log_file_path, "a") as log_file:
232-
log_file.write("\n" + "=" * 80 + "\n")
233-
log_file.write(f"Record ID: {record_id}\n")
234-
log_file.write(f"Deposit ID: {deposit_id}\n")
235-
log_file.write(f"Flow ID: {flow_id}\n")
236-
log_file.write("-" * 80 + "\n")
237-
238-
# Log previously failed tasks
239-
if failed_tasks:
240-
log_file.write("Previously failed tasks:\n")
241-
for task_name, task_id in failed_tasks:
242-
log_file.write(f" - {task_name} (ID: {task_id})\n")
243-
else:
244-
log_file.write("No previously failed tasks.\n")
245-
246-
log_file.write("-" * 80 + "\n")
247-
log_file.write("Latest task statuses:\n")
248-
249-
# Iterate all tasks in the flow and log their current statuses
250-
for task in flow.tasks:
251-
task_obj = db.session.query(FlowTaskMetadata).get(task.id)
252-
log_file.write(f" • {task_obj.name:<30} | Status: {task_obj.status}\n")
253-
254-
log_file.write("=" * 80 + "\n\n")
264+
log_success("ExtractChapterFramesTask started async.")
265+
else:
266+
log_success("Skipped ExtractChapterFramesTask.")
255267

256268

257269
def load_record_ids(redirections_file_path):
    """Return the ``cds_videos_recid`` of every entry in the redirections file.

    :param redirections_file_path: path to a JSON file containing a list of
        redirection objects, each carrying a ``cds_videos_recid`` key.
    :return: list of record ids, in file order.
    """
    with open(redirections_file_path, "r") as redirections_file:
        entries = json.load(redirections_file)

    # Each entry maps an old record to its migrated CDS Videos record id.
    return [entry["cds_videos_recid"] for entry in entries]
264276

265277

266278
def main():
    """Re-run failed flow tasks for migrated video records, then run chapters.

    Reads record ids from ``/tmp/record_redirections.json``, retries any
    failed flow tasks per record (first pass), and then triggers the chapter
    extraction task per record (second pass) once metadata/frames retries
    have been dispatched everywhere.  Progress is appended to a timestamped
    success log and error log under ``/tmp``.
    """
    global SUCCESS_LOG_PATH, ERROR_LOG_PATH

    # Create success and error log files, one pair per run.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    SUCCESS_LOG_PATH = f"/tmp/task_recovery_log_success_{timestamp}.txt"
    ERROR_LOG_PATH = f"/tmp/task_recovery_log_error_{timestamp}.txt"

    started_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    for path, title in ((SUCCESS_LOG_PATH, "Success"), (ERROR_LOG_PATH, "Error")):
        with open(path, "w") as log_file:
            log_file.write(f"{title} Log - Started at {started_at}\n")
            log_file.write("=" * 80 + "\n\n")

    redirections_file_path = "/tmp/record_redirections.json"
    all_record_ids = load_record_ids(redirections_file_path)
    record_ids = all_record_ids[:100]  # any subset

    # First pass: retry the failed tasks of each record's flow.
    for record_id in record_ids:
        log_success(f"Processing record {record_id}")
        try:
            record = record_video_resolver(record_id)
            deposit_id = record["_deposit"]["id"]

            flow, failed_tasks = find_failed_tasks(deposit_id)
            if not failed_tasks:
                log_success(f"No failed tasks found for record {record_id}.")
            else:
                task_names = [task[0] for task in failed_tasks]
                log_success(f"Failed tasks: {task_names}")
                run_failed_tasks(failed_tasks, flow, deposit_id, record_id)
        except Exception as e:
            # One broken record must not abort the whole migration run.
            log_error(f"ERROR: processing record {record_id} failed: {e}")

    # Second pass: after all records are processed for metadata and frames
    # we can run the chapters task.
    for record_id in record_ids:
        log_success(f"Processing record {record_id}")
        try:
            record = record_video_resolver(record_id)
            deposit_id = record["_deposit"]["id"]

            flow, _failed_tasks = find_failed_tasks(deposit_id)
            run_chapters_task(deposit_id, record_id, flow.id)
        except Exception as e:
            log_error(f"ERROR: chapters for record {record_id} failed: {e}")

0 commit comments

Comments (0)