Skip to main content
Glama

analyze_salary_by_location_and_skills

Analyze salary data by location and skills to identify compensation trends and inform career decisions.

Instructions

Analyze salary statistics by location and skills combination.

Args:
- file_path: Path to the data file
- salary_column: Column name containing salary information
- location_column: Column name containing location information
- skills_column: Column name containing comma-separated skills
- output_path: Optional path to save the analysis results

Returns: Salary analysis by location and skills

Input Schema

TableJSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| file_path | Yes | | |
| salary_column | Yes | | |
| location_column | Yes | | |
| skills_column | Yes | | |
| output_path | No | | |

Implementation Reference

  • Complete implementation of analyze_salary_by_location_and_skills tool. This function loads data from a file, validates salary/location/skills columns, cleans salary data by extracting numeric values using regex, analyzes salary statistics by location and skills (mean, median, min, max, std), and optionally saves results to an output file.
    def analyze_salary_by_location_and_skills(file_path: str, salary_column: str, location_column: str, 
                                            skills_column: str, output_path: Optional[str] = None) -> str:
        """
        Analyze salary statistics by location and skills combination.
        
        Args:
            file_path: Path to the data file
            salary_column: Column name containing salary information
            location_column: Column name containing location information
            skills_column: Column name containing comma-separated skills
            output_path: Optional path to save the analysis results
        
        Returns:
            Salary analysis by location and skills
        """
        try:
            import pandas as pd
            from pathlib import Path
            from collections import defaultdict
            import re
            
            # Load the data
            file_extension = Path(file_path).suffix.lower()
            if file_extension == '.csv':
                df = pd.read_csv(file_path)
            elif file_extension == '.json':
                df = pd.read_json(file_path)
            elif file_extension in ['.xlsx', '.xls']:
                df = pd.read_excel(file_path)
            elif file_extension == '.tsv':
                df = pd.read_csv(file_path, sep='\t')
            else:
                df = pd.read_csv(file_path)
            
            # Validate columns exist
            for col in [salary_column, location_column, skills_column]:
                if col not in df.columns:
                    return f"Error: Column '{col}' not found in data"
            
            # Clean and convert salary data
            def extract_salary(salary_str):
                if pd.isna(salary_str):
                    return None
                # Extract numbers from salary string (handle ranges by taking average)
                numbers = re.findall(r'\d+(?:,\d{3})*(?:\.\d+)?', str(salary_str))
                if len(numbers) == 0:
                    return None
                elif len(numbers) == 1:
                    return float(numbers[0].replace(',', ''))
                else:
                    # Take average of range
                    nums = [float(n.replace(',', '')) for n in numbers]
                    return sum(nums) / len(nums)
            
            df['salary_numeric'] = df[salary_column].apply(extract_salary)
            
            # Filter out rows with missing data
            df_clean = df.dropna(subset=['salary_numeric', location_column, skills_column])
            
            if len(df_clean) == 0:
                return "Error: No valid data rows found after cleaning"
            
            # Analyze by location
            location_analysis = []
            for location in df_clean[location_column].unique():
                location_data = df_clean[df_clean[location_column] == location]
                
                salaries = location_data['salary_numeric']
                location_stats = {
                    "location": location,
                    "job_count": len(location_data),
                    "avg_salary": round(salaries.mean(), 2),
                    "median_salary": round(salaries.median(), 2),
                    "min_salary": round(salaries.min(), 2),
                    "max_salary": round(salaries.max(), 2),
                    "std_salary": round(salaries.std(), 2)
                }
                location_analysis.append(location_stats)
            
            # Sort by average salary
            location_analysis.sort(key=lambda x: x["avg_salary"], reverse=True)
            
            # Analyze by top skills
            skill_salary_data = defaultdict(list)
            
            for _, row in df_clean.iterrows():
                skills_str = row[skills_column]
                salary = row['salary_numeric']
                
                if pd.isna(skills_str):
                    continue
                    
                skills = [skill.strip() for skill in str(skills_str).split(',') if skill.strip()]
                for skill in skills:
                    skill_salary_data[skill].append(salary)
            
            # Calculate skill statistics (only for skills with enough data points)
            skill_analysis = []
            for skill, salaries in skill_salary_data.items():
                if len(salaries) >= 5:  # At least 5 data points
                    skill_stats = {
                        "skill": skill,
                        "job_count": len(salaries),
                        "avg_salary": round(sum(salaries) / len(salaries), 2),
                        "median_salary": round(sorted(salaries)[len(salaries)//2], 2),
                        "min_salary": round(min(salaries), 2),
                        "max_salary": round(max(salaries), 2)
                    }
                    skill_analysis.append(skill_stats)
            
            # Sort by average salary
            skill_analysis.sort(key=lambda x: x["avg_salary"], reverse=True)
            
            # Save analysis if output path provided
            if output_path:
                analysis_data = {
                    "location_analysis": location_analysis,
                    "skill_analysis": skill_analysis[:50]  # Top 50 skills
                }
                
                if output_path.endswith('.json'):
                    with open(output_path, 'w') as f:
                        json.dump(analysis_data, f, indent=2)
                else:
                    # Create separate sheets for locations and skills
                    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                        pd.DataFrame(location_analysis).to_excel(writer, sheet_name='Locations', index=False)
                        pd.DataFrame(skill_analysis).to_excel(writer, sheet_name='Skills', index=False)
            
            result = {
                "salary_analysis_completed": True,
                "locations_analyzed": len(location_analysis),
                "skills_analyzed": len(skill_analysis),
                "total_jobs_analyzed": len(df_clean),
                "top_paying_locations": location_analysis[:10],
                "top_paying_skills": skill_analysis[:15],
                "output_file": output_path if output_path else None
            }
            
            return json.dumps(result, indent=2)
            
        except Exception as e:
            return f"Error analyzing salary by location and skills: {str(e)}\n{traceback.format_exc()}"

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/moeloubani/visidata-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server