analyze_salary_by_location_and_skills
Analyze salary data by location and skills to identify compensation trends and inform career decisions.
Instructions
Analyze salary statistics by location and skills combination.
Args:
- file_path: Path to the data file
- salary_column: Column name containing salary information
- location_column: Column name containing location information
- skills_column: Column name containing comma-separated skills
- output_path: Optional path to save the analysis results
Returns: Salary analysis by location and skills
Input Schema
Table / JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| file_path | Yes | ||
| salary_column | Yes | ||
| location_column | Yes | ||
| skills_column | Yes | ||
| output_path | No | | |
Implementation Reference
- src/visidata_mcp/server.py:1318-1460 (handler)Complete implementation of analyze_salary_by_location_and_skills tool. This function loads data from a file, validates salary/location/skills columns, cleans salary data by extracting numeric values using regex, analyzes salary statistics by location and skills (mean, median, min, max, std), and optionally saves results to an output file.
def analyze_salary_by_location_and_skills(file_path: str, salary_column: str, location_column: str, skills_column: str, output_path: Optional[str] = None) -> str:
    """Analyze salary statistics by location and by individual skill.

    Args:
        file_path: Path to the tabular data file (.csv, .json, .xlsx, .xls,
            .tsv; any other extension is parsed as CSV).
        salary_column: Column containing salary text; numeric values are
            extracted with a regex and salary ranges are averaged.
        location_column: Column containing location values.
        skills_column: Column containing comma-separated skill names.
        output_path: Optional path to save the analysis. A ``.json`` path gets
            a JSON dump; any other path is written as an Excel workbook with
            'Locations' and 'Skills' sheets (requires openpyxl).

    Returns:
        JSON string with per-location and per-skill salary statistics, or an
        ``"Error: ..."`` message string on failure (this function reports
        errors as strings rather than raising).
    """
    try:
        import json
        import re
        import statistics
        from collections import defaultdict
        from pathlib import Path

        import pandas as pd

        # --- Load the data based on file extension -------------------------
        ext = Path(file_path).suffix.lower()
        if ext == '.csv':
            df = pd.read_csv(file_path)
        elif ext == '.json':
            df = pd.read_json(file_path)
        elif ext in ('.xlsx', '.xls'):
            df = pd.read_excel(file_path)
        elif ext == '.tsv':
            df = pd.read_csv(file_path, sep='\t')
        else:
            # Unknown extension: fall back to CSV parsing.
            df = pd.read_csv(file_path)

        # --- Validate required columns -------------------------------------
        for col in (salary_column, location_column, skills_column):
            if col not in df.columns:
                return f"Error: Column '{col}' not found in data"

        # --- Clean salary values -------------------------------------------
        # Compiled once instead of re-scanning the pattern per row.
        number_re = re.compile(r'\d+(?:,\d{3})*(?:\.\d+)?')

        def extract_salary(salary_str):
            """Pull a numeric salary out of free-form text; ranges are averaged."""
            if pd.isna(salary_str):
                return None
            numbers = number_re.findall(str(salary_str))
            if not numbers:
                return None
            # A single match averages to itself, so one path covers both the
            # plain-value and the range ("80,000 - 100,000") cases.
            values = [float(n.replace(',', '')) for n in numbers]
            return sum(values) / len(values)

        df['salary_numeric'] = df[salary_column].apply(extract_salary)
        df_clean = df.dropna(subset=['salary_numeric', location_column, skills_column])
        if len(df_clean) == 0:
            return "Error: No valid data rows found after cleaning"

        # --- Per-location statistics ---------------------------------------
        location_analysis = []
        for location in df_clean[location_column].unique():
            salaries = df_clean.loc[df_clean[location_column] == location, 'salary_numeric']
            location_analysis.append({
                "location": location,
                "job_count": len(salaries),
                "avg_salary": round(salaries.mean(), 2),
                "median_salary": round(salaries.median(), 2),
                "min_salary": round(salaries.min(), 2),
                "max_salary": round(salaries.max(), 2),
                # pandas .std() (ddof=1) is NaN for a single sample, and NaN
                # makes json.dumps emit non-standard JSON — report 0.0 instead.
                "std_salary": 0.0 if len(salaries) < 2 else round(salaries.std(), 2),
            })
        location_analysis.sort(key=lambda x: x["avg_salary"], reverse=True)

        # --- Per-skill statistics ------------------------------------------
        skill_salary_data = defaultdict(list)
        for _, row in df_clean.iterrows():
            skills_str = row[skills_column]
            if pd.isna(skills_str):
                continue
            for skill in (s.strip() for s in str(skills_str).split(',')):
                if skill:
                    skill_salary_data[skill].append(row['salary_numeric'])

        skill_analysis = []
        for skill, salaries in skill_salary_data.items():
            if len(salaries) < 5:  # require at least 5 data points per skill
                continue
            skill_analysis.append({
                "skill": skill,
                "job_count": len(salaries),
                "avg_salary": round(sum(salaries) / len(salaries), 2),
                # statistics.median averages the two middle values for an even
                # count; the previous sorted()[n//2] picked the upper-middle
                # element, which is not the median.
                "median_salary": round(statistics.median(salaries), 2),
                "min_salary": round(min(salaries), 2),
                "max_salary": round(max(salaries), 2),
            })
        skill_analysis.sort(key=lambda x: x["avg_salary"], reverse=True)

        # --- Optionally persist the analysis -------------------------------
        if output_path:
            if output_path.endswith('.json'):
                analysis_data = {
                    "location_analysis": location_analysis,
                    "skill_analysis": skill_analysis[:50],  # top 50 skills
                }
                with open(output_path, 'w') as f:
                    json.dump(analysis_data, f, indent=2)
            else:
                # Excel output: one sheet for locations, one for skills.
                with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                    pd.DataFrame(location_analysis).to_excel(writer, sheet_name='Locations', index=False)
                    pd.DataFrame(skill_analysis).to_excel(writer, sheet_name='Skills', index=False)

        result = {
            "salary_analysis_completed": True,
            "locations_analyzed": len(location_analysis),
            "skills_analyzed": len(skill_analysis),
            "total_jobs_analyzed": len(df_clean),
            "top_paying_locations": location_analysis[:10],
            "top_paying_skills": skill_analysis[:15],
            "output_file": output_path if output_path else None,
        }
        return json.dumps(result, indent=2)
    except Exception as e:
        # Imported here so the error path works even if the try-block imports
        # were never reached.
        import traceback
        return f"Error analyzing salary by location and skills: {str(e)}\n{traceback.format_exc()}"