#!/usr/bin/env python3
"""
Bookmark and History Organizer
A comprehensive tool to clean, organize, and categorize bookmarks and browser history.
"""
import re
import json
import csv
import html
from datetime import datetime
from urllib.parse import urlparse, parse_qs, urlencode
from collections import defaultdict, Counter
from typing import List, Dict
import argparse
from pathlib import Path
class BookmarkParser:
"""Parse different bookmark formats"""
@staticmethod
def parse_netscape_html(file_path: str) -> List[Dict]:
"""Parse Netscape bookmark HTML format"""
bookmarks = []
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
# Extract bookmark links using regex
pattern = r'<A HREF="([^"]+)"[^>]*>([^<]+)</A>'
matches = re.findall(pattern, content)
for url, title in matches:
# Decode HTML entities
title = html.unescape(title)
url = html.unescape(url)
bookmarks.append({
'url': url,
'title': title,
'source': 'bookmarks',
'domain': urlparse(url).netloc,
                'added_date': datetime.now().isoformat()  # timestamp of this import run; the export's ADD_DATE attribute is not parsed
})
return bookmarks
@staticmethod
def parse_csv_history(file_path: str) -> List[Dict]:
"""Parse CSV browser history format"""
history = []
with open(file_path, 'r', encoding='utf-8-sig') as f: # Use utf-8-sig to handle BOM
reader = csv.DictReader(f)
for row in reader:
try:
                    # utf-8-sig already strips the BOM; fall back to a BOM-prefixed key just in case
                    datetime_key = 'DateTime' if 'DateTime' in row else '\ufeffDateTime'
history.append({
'url': row['NavigatedToUrl'],
'title': row['PageTitle'],
'source': 'history',
'domain': urlparse(row['NavigatedToUrl']).netloc,
'visit_date': row[datetime_key]
})
except KeyError as e:
print(f"Warning: Missing column {e} in CSV file")
continue
return history
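# The CSV parser above expects a header row with at least these columns (order does not matter):
#   DateTime,NavigatedToUrl,PageTitle
# which appears to match Microsoft Edge's browsing-history export (the BrowserHistory_*.csv loaded below).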
class ContentAnalyzer:
"""Analyze content and categorize URLs"""
def __init__(self):
self.categories = {
'programming': {
'keywords': ['github', 'stackoverflow', 'coding', 'programming', 'developer', 'api', 'documentation', 'tutorial', 'code', 'python', 'javascript', 'java', 'react', 'node', 'git'],
'domains': ['github.com', 'stackoverflow.com', 'developer.mozilla.org', 'docs.python.org', 'nodejs.org', 'reactjs.org']
},
'data_science': {
'keywords': ['data', 'analytics', 'machine learning', 'ai', 'pandas', 'numpy', 'tensorflow', 'pytorch', 'jupyter', 'databricks', 'spark', 'sql', 'database'],
'domains': ['kaggle.com', 'databricks.com', 'pandas.pydata.org', 'numpy.org', 'tensorflow.org', 'pytorch.org']
},
'education': {
'keywords': ['course', 'learn', 'education', 'university', 'college', 'tutorial', 'lesson', 'study', 'academic', 'research'],
'domains': ['coursera.org', 'edx.org', 'udemy.com', 'khanacademy.org', 'mit.edu', 'stanford.edu']
},
'news': {
'keywords': ['news', 'article', 'blog', 'post', 'medium', 'substack', 'newsletter'],
'domains': ['bbc.com', 'cnn.com', 'reuters.com', 'medium.com', 'substack.com']
},
'social': {
'keywords': ['twitter', 'facebook', 'instagram', 'linkedin', 'reddit', 'social', 'community'],
'domains': ['twitter.com', 'x.com', 'facebook.com', 'instagram.com', 'linkedin.com', 'reddit.com']
},
'tools': {
'keywords': ['tool', 'utility', 'converter', 'calculator', 'generator', 'editor', 'design', 'productivity'],
'domains': ['canva.com', 'figma.com', 'notion.so', 'trello.com', 'slack.com']
},
'entertainment': {
'keywords': ['video', 'music', 'game', 'movie', 'entertainment', 'youtube', 'netflix', 'spotify'],
                'domains': ['youtube.com', 'netflix.com', 'spotify.com', 'twitch.tv', 'steampowered.com']
},
'shopping': {
'keywords': ['shop', 'buy', 'store', 'amazon', 'ebay', 'price', 'deal', 'discount'],
'domains': ['amazon.com', 'ebay.com', 'etsy.com', 'shopify.com']
}
}
def categorize_url(self, url: str, title: str = "") -> str:
"""Categorize a URL based on domain and content"""
domain = urlparse(url).netloc.lower()
text_to_analyze = f"{url} {title}".lower()
scores = defaultdict(int)
for category, data in self.categories.items():
# Check domain matches
for cat_domain in data['domains']:
if cat_domain in domain:
scores[category] += 3
# Check keyword matches
for keyword in data['keywords']:
if keyword in text_to_analyze:
scores[category] += 1
if scores:
return max(scores, key=scores.get)
return 'uncategorized'
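    # Illustration with hypothetical inputs: categorize_url("https://github.com/user/repo",
    # "Python tutorial") returns 'programming' (the github.com domain match scores +3, plus
    # several keyword hits), while a URL/title with no matches falls through to 'uncategorized'.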
def extract_keywords(self, url: str, title: str = "") -> List[str]:
"""Extract relevant keywords from URL and title"""
keywords = set()
# Extract from URL path
parsed = urlparse(url)
path_parts = [part for part in parsed.path.split('/') if part and len(part) > 2]
keywords.update(path_parts)
# Extract from title
title_words = re.findall(r'\b\w+\b', title.lower())
keywords.update([word for word in title_words if len(word) > 3])
# Extract from domain
domain_parts = parsed.netloc.split('.')
keywords.update([part for part in domain_parts if len(part) > 2 and part not in ['com', 'org', 'net', 'edu']])
        return sorted(keywords)[:10]  # sort so the 10 kept keywords are deterministic
class Deduplicator:
"""Remove duplicate and redundant URLs"""
def __init__(self):
self.seen_urls = set()
self.seen_titles = set()
def normalize_url(self, url: str) -> str:
"""Normalize URL for comparison"""
parsed = urlparse(url)
# Remove common tracking parameters
query_params = parse_qs(parsed.query)
filtered_params = {}
for key, value in query_params.items():
if key.lower() not in ['utm_source', 'utm_medium', 'utm_campaign', 'utm_content', 'utm_term', 'fbclid', 'gclid']:
filtered_params[key] = value
        # Rebuild the query string (doseq keeps repeated parameters instead of dropping them)
        new_query = urlencode(filtered_params, doseq=True)
normalized = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
if new_query:
normalized += f"?{new_query}"
return normalized
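    # Example of the normalization above (hypothetical URL):
    #   https://example.com/page?utm_source=mail&id=7  ->  https://example.com/page?id=7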
def is_duplicate(self, url: str, title: str = "") -> bool:
"""Check if URL is a duplicate"""
normalized_url = self.normalize_url(url)
# Check for exact URL match
if normalized_url in self.seen_urls:
return True
# Check for similar titles (fuzzy matching)
if title:
title_lower = title.lower().strip()
for seen_title in self.seen_titles:
if self._similarity(title_lower, seen_title.lower()) > 0.8:
return True
# Add to seen sets
self.seen_urls.add(normalized_url)
if title:
self.seen_titles.add(title.lower().strip())
return False
def _similarity(self, s1: str, s2: str) -> float:
"""Calculate similarity between two strings"""
if not s1 or not s2:
return 0.0
# Simple Jaccard similarity
set1 = set(s1.split())
set2 = set(s2.split())
intersection = len(set1.intersection(set2))
union = len(set1.union(set2))
return intersection / union if union > 0 else 0.0
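# Worked example of the Jaccard similarity used above (hypothetical titles):
#   _similarity("python data tutorial", "python tutorial") = 2 shared words / 3 total ≈ 0.67,
#   which is below the 0.8 threshold in is_duplicate(), so the titles are not treated as duplicates.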
class BookmarkOrganizer:
"""Main organizer class"""
def __init__(self, data_folder: str = "data"):
self.data_folder = Path(data_folder)
self.parser = BookmarkParser()
self.analyzer = ContentAnalyzer()
self.deduplicator = Deduplicator()
self.all_items = []
def load_data(self):
"""Load all bookmark and history data"""
print("Loading data from files...")
# Load bookmarks
bookmark_file = self.data_folder / "bookmarks_10_26_25.html"
if bookmark_file.exists():
bookmarks = self.parser.parse_netscape_html(str(bookmark_file))
self.all_items.extend(bookmarks)
print(f"Loaded {len(bookmarks)} bookmarks")
# Load history
history_file = self.data_folder / "BrowserHistory_10_26_25.csv"
if history_file.exists():
history = self.parser.parse_csv_history(str(history_file))
self.all_items.extend(history)
print(f"Loaded {len(history)} history entries")
print(f"Total items loaded: {len(self.all_items)}")
def clean_and_deduplicate(self):
"""Remove duplicates and clean data"""
print("Cleaning and deduplicating...")
cleaned_items = []
duplicates_removed = 0
for item in self.all_items:
if not self.deduplicator.is_duplicate(item['url'], item.get('title', '')):
cleaned_items.append(item)
else:
duplicates_removed += 1
self.all_items = cleaned_items
print(f"Removed {duplicates_removed} duplicates")
print(f"Clean items remaining: {len(self.all_items)}")
def categorize_and_tag(self):
"""Categorize items and add keywords"""
print("Categorizing and tagging items...")
for item in self.all_items:
item['category'] = self.analyzer.categorize_url(item['url'], item.get('title', ''))
item['keywords'] = self.analyzer.extract_keywords(item['url'], item.get('title', ''))
def organize_by_category(self) -> Dict[str, List[Dict]]:
"""Organize items by category"""
organized = defaultdict(list)
for item in self.all_items:
organized[item['category']].append(item)
# Sort each category by domain frequency
for category, items in organized.items():
domain_counts = Counter(item['domain'] for item in items)
items.sort(key=lambda x: domain_counts[x['domain']], reverse=True)
return dict(organized)
def generate_report(self, organized_data: Dict[str, List[Dict]]) -> str:
"""Generate a summary report"""
report = []
report.append("=" * 60)
report.append("BOOKMARK & HISTORY ORGANIZATION REPORT")
report.append("=" * 60)
report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
report.append("")
# Summary statistics
total_items = sum(len(items) for items in organized_data.values())
report.append(f"Total organized items: {total_items}")
report.append(f"Categories: {len(organized_data)}")
report.append("")
# Category breakdown
report.append("CATEGORY BREAKDOWN:")
report.append("-" * 30)
for category, items in sorted(organized_data.items(), key=lambda x: len(x[1]), reverse=True):
report.append(f"{category.upper()}: {len(items)} items")
# Top domains in this category
domain_counts = Counter(item['domain'] for item in items)
top_domains = domain_counts.most_common(3)
for domain, count in top_domains:
report.append(f" - {domain}: {count} items")
report.append("")
return "\n".join(report)
def save_organized_data(self, organized_data: Dict[str, List[Dict]], output_folder: str = "organized"):
"""Save organized data to files"""
output_path = Path(output_folder)
        output_path.mkdir(parents=True, exist_ok=True)
print(f"Saving organized data to {output_path}...")
# Save each category to separate JSON file
for category, items in organized_data.items():
category_file = output_path / f"{category}.json"
with open(category_file, 'w', encoding='utf-8') as f:
json.dump(items, f, indent=2, ensure_ascii=False)
# Save complete organized data
complete_file = output_path / "complete_organized.json"
with open(complete_file, 'w', encoding='utf-8') as f:
json.dump(organized_data, f, indent=2, ensure_ascii=False)
# Save report
report_file = output_path / "organization_report.txt"
report = self.generate_report(organized_data)
with open(report_file, 'w', encoding='utf-8') as f:
f.write(report)
print(f"Saved {len(organized_data)} category files")
print(f"Report saved to {report_file}")
def run(self, output_folder: str = "organized"):
"""Run the complete organization process"""
print("Starting bookmark and history organization...")
# Load data
self.load_data()
if not self.all_items:
print("No data found to organize!")
return
# Clean and deduplicate
self.clean_and_deduplicate()
# Categorize and tag
self.categorize_and_tag()
# Organize by category
organized_data = self.organize_by_category()
# Save results
self.save_organized_data(organized_data, output_folder)
# Print summary
print("\n" + self.generate_report(organized_data))
print(f"\nOrganization complete! Check the '{output_folder}' folder for results.")
def main():
parser = argparse.ArgumentParser(description='Organize bookmarks and browser history')
parser.add_argument('--data-folder', default='data', help='Folder containing bookmark and history files')
parser.add_argument('--output', default='organized', help='Output folder for organized data')
args = parser.parse_args()
organizer = BookmarkOrganizer(args.data_folder)
organizer.run(args.output)
if __name__ == "__main__":
main()