"""
Deadline Extractor
Extracts deadlines and due dates from course syllabi and assignment descriptions.
Uses regular-expression pattern matching to identify dates.
"""
import calendar
import html
import re
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
class DeadlineExtractor:
    """Extract deadlines from text content.

    Lines that mention a deadline keyword are scanned for dates written in
    several common formats. Every hit is reported as a dictionary with the
    keys ``course_code``, ``assignment_name``, ``due_date``, ``description``
    and ``confidence``.
    """

    # Common deadline keywords
    DEADLINE_KEYWORDS = [
        'due', 'deadline', 'submit', 'submission', 'hand in', 'turn in',
        'complete by', 'finish by', 'ends', 'closes', 'final date'
    ]

    # Month names and abbreviations
    MONTHS = {
        'january': 1, 'jan': 1,
        'february': 2, 'feb': 2,
        'march': 3, 'mar': 3,
        'april': 4, 'apr': 4,
        'may': 5,
        'june': 6, 'jun': 6,
        'july': 7, 'jul': 7,
        'august': 8, 'aug': 8,
        'september': 9, 'sep': 9, 'sept': 9,
        'october': 10, 'oct': 10,
        'november': 11, 'nov': 11,
        'december': 12, 'dec': 12
    }

    # Assignment-name patterns, tried in this order; the first hit wins.
    # The leading \b keeps e.g. "contest 2" from being read as "test 2".
    _ASSIGNMENT_NAME_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'\b(assignment\s+\d+)',
            r'\b(homework\s+\d+)',
            r'\b(hw\s+\d+)',
            r'\b(quiz\s+\d+)',
            r'\b(test\s+\d+)',
            r'\b(exam\s+\d+)',
            r'\b(midterm\b(?:\s+\d+)?)',
            r'\b(final\s+exam)',
            r'\b(project\b(?:\s+\d+)?)',
            r'\b(lab\s+\d+)',
            r'\b(problem\s+set\s+\d+)',
            r'\b(ps\s+\d+)',
        )
    )

    # Bullet and separator characters trimmed from a name taken from the
    # first words of a line (space, tab, colon, semicolon, comma, hyphen,
    # asterisk, en dash, em dash, bullet).
    _NAME_TRIM_CHARS = ' \t:;,-*\u2013\u2014\u2022'

    def __init__(self, current_year: Optional[int] = None):
        """
        Initialize deadline extractor.

        Args:
            current_year: Year assumed for dates written without one
                (defaults to the current calendar year)
        """
        # `is not None` rather than `or`, so an explicit value is never
        # silently replaced.
        if current_year is not None:
            self.current_year = current_year
        else:
            self.current_year = datetime.now().year
        self._compile_patterns()

    def _compile_patterns(self) -> None:
        """Compile the keyword and date regexes from the class-level tables."""
        keywords = '|'.join(
            r'\s+'.join(re.escape(part) for part in keyword.split())
            for keyword in self.DEADLINE_KEYWORDS
        )
        # Anchor keywords at a word start so "depends"/"weekends" do not hit
        # 'ends' and "residue" does not hit 'due'. Suffixes ("submitted",
        # "deadlines") and a "re"/"re-" prefix ("resubmit") still match.
        self._keyword_re = re.compile(
            r'\b(?:re-?)?(?:' + keywords + r')', re.IGNORECASE
        )

        # Longest names first so the alternation prefers "september" to "sep".
        month_names = sorted(self.MONTHS, key=len, reverse=True)
        month = (
            r'\b(' + '|'.join(re.escape(name) for name in month_names)
            + r')\b\.?'
        )
        day = r'(\d{1,2})(?:st|nd|rd|th)?'

        # YYYY-MM-DD (ISO format)
        self._iso_re = re.compile(r'(?<!\d)(\d{4})-(\d{1,2})-(\d{1,2})(?!\d)')
        # Month Day, Year (e.g. "January 15, 2026", "Jan. 15th 2026")
        self._month_day_year_re = re.compile(
            month + r'\s+' + day + r',?\s+(\d{4})\b', re.IGNORECASE
        )
        # Day Month Year (e.g. "15 January 2026")
        self._day_month_year_re = re.compile(
            r'(?<!\d)' + day + r'\s+' + month + r',?\s+(\d{4})\b',
            re.IGNORECASE,
        )
        # MM/DD/YYYY or MM-DD-YYYY
        self._numeric_re = re.compile(
            r'(?<!\d)(\d{1,2})[/-](\d{1,2})[/-](\d{4})(?!\d)'
        )
        # Month Day without a year (e.g. "January 15")
        self._month_day_re = re.compile(
            month + r'\s+' + day + r'(?!\d)', re.IGNORECASE
        )

    def extract_deadlines(self, text: str, course_code: str = "") -> List[Dict[str, Any]]:
        """
        Extract all deadlines from text.

        Each line containing a deadline keyword is examined. Dates written on
        that line are used; only when the line has none are the neighbouring
        lines (one before, two after) searched, so that adjacent deadline
        lines do not pick up each other's dates.

        Args:
            text: Text to extract deadlines from
            course_code: Course code for context

        Returns:
            List of deadline dictionaries, de-duplicated and sorted by due date
        """
        if not text:
            return []

        lines = text.splitlines()
        deadlines = []
        for index, line in enumerate(lines):
            if not self._keyword_re.search(line):
                continue

            # Slicing clamps at the end of the list, so no upper bound check.
            context = '\n'.join(lines[max(0, index - 1):index + 3])
            dates = self.extract_dates(line) or self.extract_dates(context)
            if not dates:
                continue

            # The name depends only on the line, so compute it once.
            assignment_name = self.extract_assignment_name(line, context)
            description = line.strip()
            for date_info in dates:
                deadlines.append({
                    'course_code': course_code,
                    'assignment_name': assignment_name,
                    'due_date': date_info['date'],
                    'description': description,
                    'confidence': date_info['confidence']
                })

        return self.deduplicate_deadlines(deadlines)

    @staticmethod
    def _make_date(year: int, month: Optional[int], day: int) -> Optional[datetime]:
        """Return ``datetime(year, month, day)``, or None if not a real date."""
        if month is None:
            return None
        try:
            return datetime(year, month, day)
        except ValueError:
            return None

    def _infer_year(self, month: Optional[int], day: int) -> Optional[datetime]:
        """Resolve a year-less date to ``current_year``, or the year after it
        when that day is already in the past."""
        candidate = self._make_date(self.current_year, month, day)
        # Compare calendar days, not instants: a deadline dated today must
        # not be pushed to next year just because midnight has passed.
        if candidate is not None and candidate.date() < datetime.now().date():
            candidate = self._make_date(self.current_year + 1, month, day)
        return candidate

    def _date_candidates(self, text: str):
        """Yield ``(match, date_or_None, confidence)`` for every date-like
        span, most specific format first."""
        # re.IGNORECASE can match a few non-ASCII letters (e.g. long s for
        # "s"), so look month names up with .get() instead of indexing.
        months = self.MONTHS

        for match in self._iso_re.finditer(text):
            year, month, day = (int(part) for part in match.groups())
            yield match, self._make_date(year, month, day), 0.95

        for match in self._month_day_year_re.finditer(text):
            name, day, year = match.groups()
            month = months.get(name.lower())
            yield match, self._make_date(int(year), month, int(day)), 0.9

        for match in self._day_month_year_re.finditer(text):
            day, name, year = match.groups()
            month = months.get(name.lower())
            yield match, self._make_date(int(year), month, int(day)), 0.9

        for match in self._numeric_re.finditer(text):
            first, second, year = (int(part) for part in match.groups())
            if first > 12 >= second:
                # The first number cannot be a month, so the only possible
                # reading is day-first (DD/MM/YYYY).
                yield match, self._make_date(year, second, first), 0.7
            else:
                yield match, self._make_date(year, first, second), 0.8

        for match in self._month_day_re.finditer(text):
            name, day = match.groups()
            month = months.get(name.lower())
            yield match, self._infer_year(month, int(day)), 0.7

    def extract_dates(self, text: str) -> List[Dict[str, Any]]:
        """
        Extract dates from text using various patterns.

        Recognised formats and their confidence scores:
            YYYY-MM-DD                          0.95
            Month Day, Year / Day Month Year    0.9
            MM/DD/YYYY or MM-DD-YYYY            0.8
            DD/MM/YYYY (first number > 12)      0.7
            Month Day (year inferred)           0.7

        Each stretch of text is interpreted by at most one pattern, the most
        specific one, so "January 15, 2026" yields a single date rather than
        an additional year-less guess.

        Args:
            text: Text to extract dates from

        Returns:
            List of ``{'date': datetime, 'confidence': float}`` dictionaries
            in order of appearance in the text
        """
        if not text:
            return []

        claimed = []  # (start, end) spans already interpreted
        hits = []     # (start offset, datetime, confidence)
        for match, value, confidence in self._date_candidates(text):
            start, end = match.span()
            if any(start < taken_end and taken_start < end
                   for taken_start, taken_end in claimed):
                continue
            # Claim the span even when the date is invalid, so an explicit
            # but impossible date is not re-read by a looser pattern.
            claimed.append((start, end))
            if value is not None:
                hits.append((start, value, confidence))

        hits.sort(key=lambda hit: hit[0])
        return [
            {'date': value, 'confidence': confidence}
            for _, value, confidence in hits
        ]

    def _find_labelled_name(self, text: str) -> Optional[str]:
        """Return the first "Assignment 3"-style label found in text, if any."""
        if not text:
            return None
        for pattern in self._ASSIGNMENT_NAME_PATTERNS:
            match = pattern.search(text)
            if match:
                # Collapse internal whitespace before title-casing.
                return ' '.join(match.group(1).split()).title()
        return None

    def extract_assignment_name(self, line: str, context: str) -> str:
        """
        Extract assignment name from line or context.

        Tried in order: a known label on the line ("Quiz 2", "Final Exam",
        ...); the first words of the line (at most five) that precede a
        deadline keyword; a known label anywhere in the context; and finally
        the generic name "Assignment".

        Args:
            line: Current line
            context: Surrounding context

        Returns:
            Assignment name
        """
        labelled = self._find_labelled_name(line)
        if labelled:
            return labelled

        leading_words = []
        for word in line.split()[:5]:
            if self._keyword_re.search(word):
                break
            leading_words.append(word)
        name = ' '.join(leading_words).strip(self._NAME_TRIM_CHARS)
        if name:
            return name

        return self._find_labelled_name(context) or "Assignment"

    def deduplicate_deadlines(self, deadlines: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Remove duplicate deadlines.

        Two entries are duplicates when course code, lower-cased assignment
        name and calendar day of the due date all agree; the entry with the
        highest confidence is kept (the earliest one on ties).

        NOTE: the result is sorted by ``due_date``, so naive and
        timezone-aware datetimes must not be mixed in one call (Python
        raises TypeError when comparing them).

        Args:
            deadlines: List of deadline dictionaries

        Returns:
            Deduplicated list sorted by due date
        """
        best = {}
        for deadline in deadlines:
            key = (
                deadline['course_code'],
                (deadline['assignment_name'] or '').lower(),
                deadline['due_date'].date()
            )
            kept = best.get(key)
            if kept is None or deadline.get('confidence', 0) > kept.get('confidence', 0):
                # Replacing a value keeps the key's original insertion slot.
                best[key] = deadline

        return sorted(best.values(), key=lambda entry: entry['due_date'])

    def extract_from_syllabus(self, syllabus_html: str, course_code: str) -> List[Dict[str, Any]]:
        """
        Extract deadlines from course syllabus HTML.

        Args:
            syllabus_html: HTML content of syllabus
            course_code: Course code

        Returns:
            List of deadline dictionaries
        """
        if not syllabus_html:
            return []

        # Remove HTML tags first, so that escaped angle brackets decoded
        # below are kept as text rather than stripped as markup.
        text = re.sub(r'<[^>]+>', ' ', syllabus_html)
        # Decode all named and numeric character references, then turn
        # non-breaking spaces into ordinary ones.
        text = html.unescape(text).replace('\xa0', ' ')

        return self.extract_deadlines(text, course_code)

    def extract_from_assignment(self, assignment_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Extract deadline from Quercus assignment data.

        Args:
            assignment_data: Assignment dictionary from Quercus API; the keys
                read are 'due_at', 'course_code', 'name' and 'description'

        Returns:
            Deadline dictionary, or None when 'due_at' is missing, empty or
            cannot be parsed as an ISO 8601 timestamp
        """
        raw_due = assignment_data.get('due_at')
        if not raw_due:
            return None

        try:
            # A trailing 'Z' is rewritten as an explicit UTC offset, which
            # every fromisoformat() implementation accepts.
            due_date = datetime.fromisoformat(raw_due.replace('Z', '+00:00'))
        except (AttributeError, TypeError, ValueError) as e:
            print(f"Failed to parse assignment due date: {e}")
            return None

        return {
            'course_code': assignment_data.get('course_code', ''),
            # `or` also covers keys that are present but None.
            'assignment_name': assignment_data.get('name') or 'Assignment',
            'due_date': due_date,
            'description': assignment_data.get('description') or '',
            'confidence': 1.0  # High confidence since it's from API
        }