-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgenerate-rss.py
More file actions
163 lines (133 loc) · 5.39 KB
/
generate-rss.py
File metadata and controls
163 lines (133 loc) · 5.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python3
"""
Generate RSS 2.0 feed for AI summaries in monthly/aisummary/
"""
from __future__ import annotations
import html
import re
from pathlib import Path
from datetime import datetime
from email.utils import formatdate
# Inline markdown patterns and their HTML replacements, applied in order.
# Bold must run before emphasis so that ``**x**`` is not consumed as ``*x*``.
_INLINE_RULES = (
    (re.compile(r'\*\*(.+?)\*\*'), r'<strong>\1</strong>'),
    (re.compile(r'\*(.+?)\*'), r'<em>\1</em>'),
    (re.compile(r'`(.+?)`'), r'<code>\1</code>'),
    (re.compile(r'\[(.+?)\]\((.+?)\)'), r'<a href="\2">\1</a>'),
)

# ATX heading: one to six '#', a single space, then at least one character.
_HEADING_RE = re.compile(r'(#{1,6}) (.+)')


def _markdown_to_html(markdown_body: str) -> str:
    """Convert a small markdown subset to HTML with balanced block tags.

    Headings become ``<hN>`` elements; every other run of non-blank lines
    becomes one ``<p>`` element whose lines are joined with ``<br>``.
    Text is not HTML-escaped here, so inline HTML in the input passes
    through unchanged.
    """
    for pattern, replacement in _INLINE_RULES:
        markdown_body = pattern.sub(replacement, markdown_body)

    blocks: list[str] = []
    paragraph: list[str] = []
    # The trailing '' is a sentinel blank line that flushes the last paragraph.
    for line in markdown_body.split('\n') + ['']:
        heading = _HEADING_RE.fullmatch(line)
        is_blank = not line.strip()
        if paragraph and (heading or is_blank):
            blocks.append('<p>' + '<br>'.join(paragraph) + '</p>')
            paragraph = []
        if heading:
            level = len(heading.group(1))
            blocks.append(f'<h{level}>{heading.group(2)}</h{level}>')
        elif not is_blank:
            paragraph.append(line)
    return ''.join(blocks)


def extract_title_and_content(markdown_text: str) -> tuple[str, str]:
    """Extract the title (first H1) and the HTML-rendered body from markdown.

    Args:
        markdown_text: Raw markdown source of one summary.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the text of the first
        ``# `` heading, or, when there is none (or it is empty), the first
        line truncated to 50 characters plus ``"..."``. ``content`` is the
        remaining text converted to basic HTML; in the fallback case the
        first line stays part of the content. Both are empty strings for
        empty input.
    """
    lines = markdown_text.strip().split('\n')

    title = None
    content_lines = []
    for line in lines:
        # Only the first H1 is the title; later H1 lines stay in the body.
        if title is None and line.startswith('# '):
            title = line[2:].strip()
        else:
            content_lines.append(line)

    if not title:
        # No usable H1: derive the title from the first line and keep every
        # line (including that first one) in the body.
        first_line = lines[0]
        title = first_line[:50] + "..." if len(first_line) > 50 else first_line
        content_lines = lines

    return title, _markdown_to_html('\n'.join(content_lines).strip())
def read_markdown_files(aisummary_dir: Path) -> list[tuple[str, str, str, datetime]]:
    """Load every ``*.md`` file in a directory as a feed entry.

    Args:
        aisummary_dir: Directory containing the markdown summaries.

    Returns:
        A list of ``(id, title, content, date)`` tuples sorted newest first,
        where ``id`` is the file stem. ``date`` is a UTC-aware datetime,
        parsed from a ``YYYY-MM-DD`` file stem when possible and taken from
        the file's modification time otherwise.
    """
    # Local import: the file header only imports ``datetime`` from this module.
    from datetime import timezone

    entries: list[tuple[str, str, str, datetime]] = []
    for path in aisummary_dir.glob("*.md"):
        if not path.is_file():
            continue

        # Always decode as UTF-8; undecodable bytes are replaced rather than
        # falling back to the platform's locale encoding.
        text = path.read_text(encoding="utf-8", errors="replace")
        title, content = extract_title_and_content(text)

        try:
            # Pin the date to UTC so the published date does not shift with
            # the time zone of the machine that builds the feed.
            date_obj = datetime.strptime(path.stem, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except ValueError:
            # Stem is not a date: fall back to the file modification time.
            date_obj = datetime.fromtimestamp(path.stat().st_mtime, tz=timezone.utc)

        entries.append((path.stem, title, content, date_obj))

    # Sort on the date actually returned (stem breaks ties), so files without
    # a date-shaped name are still ordered correctly.
    entries.sort(key=lambda entry: (entry[3], entry[0]), reverse=True)
    return entries
def generate_rss_feed(
    summaries: list[tuple[str, str, str, datetime]],
    base_url: str = "https://modelmeters.com",
    max_items: int = 20,
) -> str:
    """Generate an RSS 2.0 XML feed.

    Args:
        summaries: ``(id, title, content, date)`` tuples, expected newest
            first; only the leading ``max_items`` entries are emitted.
        base_url: Site root used to build the channel and item links. A
            trailing slash is ignored.
        max_items: Maximum number of ``<item>`` elements to emit.

    Returns:
        The complete feed as a string, starting with the XML declaration.
    """
    # Everything interpolated into the document is XML-escaped, including the
    # base URL and the file id, so '&' or '<' cannot break well-formedness.
    safe_base = html.escape(base_url.rstrip('/'))

    parts = [f'''<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>Model Meters - AI-generated Monthly Summaries</title>
    <link>{safe_base}/agent/</link>
    <description>AI-generated monthly summaries of Azure AI Foundry pricing changes and updates</description>
    <language>en-us</language>
    <lastBuildDate>{formatdate()}</lastBuildDate>
    <atom:link href="{safe_base}/agent/rss.xml" rel="self" type="application/rss+xml"/>
    <generator>Model Meters RSS Generator</generator>
    <webMaster>guy.gregory@microsoft.com</webMaster>
    <managingEditor>guy.gregory@microsoft.com</managingEditor>
    <category>Technology</category>
    <category>Azure</category>
    <category>AI</category>
''']

    # max(..., 0) keeps a negative limit from slicing from the end of the list.
    for file_id, title, content, pub_date in summaries[:max(max_items, 0)]:
        item_url = f'{safe_base}/agent/#{html.escape(file_id)}'
        # The description carries HTML, so it is escaped once for XML and
        # decoded back to markup by the feed reader.
        parts.append(f'''
    <item>
      <title>{html.escape(title)}</title>
      <link>{item_url}</link>
      <description>{html.escape(content)}</description>
      <pubDate>{formatdate(pub_date.timestamp())}</pubDate>
      <guid isPermaLink="true">{item_url}</guid>
      <category>Azure AI</category>
    </item>''')

    parts.append('''
  </channel>
</rss>''')
    return ''.join(parts)
def main() -> int:
    """Generate the RSS feed for the AI summaries and write it to disk.

    Reads markdown files from ``monthly/aisummary`` next to this script and
    writes the feed to ``agent/rss.xml``.

    Returns:
        ``0`` on success, including when there are no summaries to publish
        (in which case no feed file is written).

    Raises:
        SystemExit: If the ``monthly/aisummary`` directory does not exist.
    """
    repo_root = Path(__file__).resolve().parent
    aisummary_dir = repo_root / "monthly" / "aisummary"
    agent_dir = repo_root / "agent"

    if not aisummary_dir.is_dir():
        raise SystemExit(f"Directory not found: {aisummary_dir}")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    agent_dir.mkdir(parents=True, exist_ok=True)

    summaries = read_markdown_files(aisummary_dir)
    if not summaries:
        print("No markdown files found in monthly/aisummary")
        return 0

    rss_xml = generate_rss_feed(summaries)

    rss_file = agent_dir / "rss.xml"
    rss_file.write_text(rss_xml, encoding="utf-8")

    # Report the number of items actually written: the generator caps the
    # feed, so this can be smaller than len(summaries).
    item_count = rss_xml.count("<item>")
    print(f"Generated RSS feed: {rss_file} with {item_count} items")
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)