Skip to content

Interface API

osm_powerplants.interface

OpenStreetMap power plant data interface.

This module provides the main interface for extracting and processing power plant data from OpenStreetMap. It handles country validation, multi-level caching, and data processing.

Main functions:

- process_units: Simplified entry point for most use cases
- process_countries: Lower-level function with more options
- validate_countries: Validate country names with fuzzy matching

VALID_FUELTYPES = ['Nuclear', 'Solid Biomass', 'Biogas', 'Wind', 'Hydro', 'Solar', 'Oil', 'Natural Gas', 'Hard Coal', 'Lignite', 'Geothermal', 'Waste', 'Other'] module-attribute

VALID_TECHNOLOGIES = ['Steam Turbine', 'OCGT', 'CCGT', 'Run-Of-River', 'Reservoir', 'Pumped Storage', 'Offshore', 'Onshore', 'PV', 'CSP', 'Combustion Engine', 'Marine'] module-attribute

VALID_SETS = ['PP', 'CHP', 'Store'] module-attribute

process_countries(countries, csv_cache_path, cache_dir, update, osm_config, raw=False)

Process power plant data for specified countries.

Parameters:

Name Type Description Default
countries list of str

Country names or ISO codes

required
csv_cache_path str

Path to CSV cache file

required
cache_dir str

Cache directory path

required
update bool

Force cache update if True

required
osm_config dict

Configuration dictionary

required
raw bool

If True, return all columns including metadata

False

Returns:

Type Description
DataFrame

Power plant data

Source code in src/osm_powerplants/interface.py
def process_countries(
    countries, csv_cache_path, cache_dir, update, osm_config, raw=False
):
    """Process power plant data for specified countries.

    Parameters
    ----------
    countries : list of str
        Country names or ISO codes
    csv_cache_path : str
        Path to CSV cache file
    cache_dir : str
        Cache directory path
    update : bool
        Force cache update if True
    osm_config : dict
        Configuration dictionary
    raw : bool, default False
        If True, return all columns including metadata

    Returns
    -------
    pd.DataFrame
        Power plant data; an empty DataFrame if no country yielded data

    Raises
    ------
    ValueError
        If country validation fails (re-raised with additional context).
    """
    logger.info(f"Starting country validation for {len(countries)} countries...")

    api_url = osm_config.get("overpass_api", {}).get("api_url")
    current_config_hash = Unit._generate_config_hash(osm_config)
    force_refresh = osm_config.get("force_refresh", False)
    omitted_countries = osm_config.get("omitted_countries", [])
    try:
        valid_countries, country_code_map = validate_countries(
            countries, omitted_countries
        )
    except ValueError as e:
        raise ValueError(
            f"Country validation failed. Cannot proceed with OSM data processing.\n{str(e)}"
        ) from e

    logger.info(
        f"Country validation successful! Processing OSM data for {len(valid_countries)} countries: "
        f"{', '.join(valid_countries[:5])}"
        f"{f' and {len(valid_countries) - 5} more' if len(valid_countries) > 5 else ''}"
    )

    # Collect per-country frames and concatenate once at the end;
    # calling pd.concat inside the loop is quadratic in total row count.
    frames = []

    # Create single client for all countries
    client_params = get_client_params(osm_config, api_url, cache_dir)

    with OverpassAPIClient(**client_params) as client:
        for i, country in enumerate(valid_countries, 1):
            logger.info(
                f"Processing country {i}/{len(valid_countries)}: {country} ({country_code_map[country]})"
            )

            country_data = process_single_country(
                country,
                csv_cache_path,
                current_config_hash,
                update,
                force_refresh,
                osm_config,
                client,
            )

            if country_data is not None and not country_data.empty:
                if not raw:
                    # Strip cache metadata and log any out-of-vocabulary values
                    country_data = validate_and_standardize_df(
                        country_data,
                        VALID_FUELTYPES,
                        VALID_TECHNOLOGIES,
                        VALID_SETS,
                    )
                frames.append(country_data)

    logger.info(f"✅ Successfully processed all {len(valid_countries)} countries")
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

process_units(countries, config, cache_dir, output_path=None, raw=True)

Process power plant data for specified countries.

Parameters:

Name Type Description Default
countries list[str]

Country names or ISO codes (e.g., ['Germany', 'FR', 'ESP'])

required
config dict

Configuration from get_config()

required
cache_dir str

Cache directory from get_cache_dir()

required
output_path str

Save CSV to this path if provided

None
raw bool

If True, return all columns. If False, remove metadata columns (config_hash, created_at, processing_parameters, id).

True

Returns:

Type Description
DataFrame

Power plant data with columns: projectID, Name, Country, lat, lon, Fueltype, Technology, Set, Capacity, DateIn, type, capacity_source. When raw=True, also includes metadata columns.

Examples:

>>> from osm_powerplants import process_units, get_config, get_cache_dir
>>> config = get_config()
>>> df = process_units(
...     countries=['Malta', 'Luxembourg'],
...     config=config,
...     cache_dir=str(get_cache_dir(config)),
... )
Source code in src/osm_powerplants/interface.py
def process_units(
    countries: list[str],
    config: dict,
    cache_dir: str,
    output_path: str | None = None,
    raw: bool = True,
) -> pd.DataFrame:
    """Process power plant data for specified countries.

    Convenience wrapper around :func:`process_countries` that derives the
    CSV cache location and update flag from ``config``.

    Parameters
    ----------
    countries : list[str]
        Country names or ISO codes (e.g., ['Germany', 'FR', 'ESP'])
    config : dict
        Configuration from get_config()
    cache_dir : str
        Cache directory from get_cache_dir()
    output_path : str, optional
        Save CSV to this path if provided
    raw : bool, default True
        If True, return all columns. If False, remove metadata columns
        (config_hash, created_at, processing_parameters, id).

    Returns
    -------
    pd.DataFrame
        Power plant data with columns: projectID, Name, Country, lat, lon,
        Fueltype, Technology, Set, Capacity, DateIn, type, capacity_source.
        When raw=True, also includes metadata columns.

    Examples
    --------
    >>> from osm_powerplants import process_units, get_config, get_cache_dir
    >>> config = get_config()
    >>> df = process_units(
    ...     countries=['Malta', 'Luxembourg'],
    ...     config=config,
    ...     cache_dir=str(get_cache_dir(config)),
    ... )
    """
    import os

    result = process_countries(
        countries=countries,
        csv_cache_path=os.path.join(cache_dir, "osm_data.csv"),
        cache_dir=cache_dir,
        update=config.get("force_refresh", False),
        osm_config=config,
        raw=raw,
    )

    # Only persist when a path was given and there is something to write.
    if output_path and not result.empty:
        result.to_csv(output_path, index=False)

    return result

validate_countries(countries, omitted_countries=[])

Validate country names and provide helpful suggestions for invalid entries.

Uses pycountry to validate country names, supporting full names, ISO codes, and common variations. Provides fuzzy matching suggestions for typos or incorrect names. Checks against omitted countries list from configuration.

Parameters:

Name Type Description Default
countries list of str

Country names to validate. Accepts:

- Full names: 'Germany', 'United States'
- ISO 3166-1 alpha-2: 'DE', 'US'
- ISO 3166-1 alpha-3: 'DEU', 'USA'
- Common variations: 'USA', 'UK', 'South Korea'

required
omitted_countries list of str

List of countries to omit from processing

[]

Returns:

Name Type Description
valid_countries list of str

List of validated country names as provided (minus omitted ones)

country_code_map dict

Mapping of country names to ISO alpha-2 codes

Raises:

Type Description
ValueError

If any country names are invalid. Error message includes suggestions for similar valid names and shows which entries were valid vs invalid.

Examples:

>>> valid, codes = validate_countries(['Germany', 'France'])
>>> print(codes)
{'Germany': 'DE', 'France': 'FR'}
>>> validate_countries(['Germny'])  # Typo
ValueError: ❌ Invalid country names detected...
     ℹ️  Did you mean: 'Germany', 'Armenia'
Source code in src/osm_powerplants/interface.py
def validate_countries(
    countries: list[str], omitted_countries: list[str] | None = None
) -> tuple[list[str], dict[str, str]]:
    """Validate country names and provide helpful suggestions for invalid entries.

    Uses pycountry to validate country names, supporting full names, ISO codes,
    and common variations. Provides fuzzy matching suggestions for typos or
    incorrect names. Checks against omitted countries list from configuration.

    Parameters
    ----------
    countries : list of str
        Country names to validate. Accepts:
        - Full names: 'Germany', 'United States'
        - ISO 3166-1 alpha-2: 'DE', 'US'
        - ISO 3166-1 alpha-3: 'DEU', 'USA'
        - Common variations: 'USA', 'UK', 'South Korea'

    omitted_countries : list of str, optional
        List of countries to omit from processing. ``None`` (the default)
        means no omissions.

    Returns
    -------
    valid_countries : list of str
        List of validated country names as provided (minus omitted ones)
    country_code_map : dict
        Mapping of country names to ISO alpha-2 codes

    Raises
    ------
    ValueError
        If any country names are invalid. Error message includes
        suggestions for similar valid names and shows which entries
        were valid vs invalid.

    Examples
    --------
    >>> valid, codes = validate_countries(['Germany', 'France'])
    >>> print(codes)
    {'Germany': 'DE', 'France': 'FR'}

    >>> validate_countries(['Germny'])  # Typo
    ValueError: ❌ Invalid country names detected...
         ℹ️  Did you mean: 'Germany', 'Armenia'
    """
    import pycountry

    # None sentinel instead of a mutable [] default: a shared default list
    # would persist across calls if it were ever mutated.
    if omitted_countries is None:
        omitted_countries = []

    countries_to_be_omitted = []
    if omitted_countries:
        # Filter out omitted countries
        filtered_countries = []
        for country in countries:
            if country in omitted_countries:
                logger.info(
                    f"Omitting country '{country}' as specified in configuration (omitted_countries)"
                )
                countries_to_be_omitted.append(country)
            else:
                filtered_countries.append(country)
        countries = filtered_countries

    # If no countries left after omission, return empty results
    if not countries:
        logger.warning(
            "No countries left to process after applying omitted_countries filter"
        )
        return [], {}

    valid_countries = []
    invalid_countries = []
    country_code_map = {}

    for country in countries:
        country_code = get_country_code(country)
        if country_code is not None:
            valid_countries.append(country)
            country_code_map[country] = country_code
        else:
            invalid_countries.append(country)

    if invalid_countries:
        # Candidate pool for fuzzy matching: official names plus both ISO codes.
        all_country_names = [c.name for c in pycountry.countries]  # type: ignore
        all_country_names.extend([c.alpha_2 for c in pycountry.countries])  # type: ignore
        all_country_names.extend([c.alpha_3 for c in pycountry.countries])  # type: ignore

        # Common informal names mapped to their pycountry equivalents.
        country_variations = {
            "USA": "United States",
            "UK": "United Kingdom",
            "South Korea": "Korea, Republic of",
            "North Korea": "Korea, Democratic People's Republic of",
            "Russia": "Russian Federation",
            "Iran": "Iran, Islamic Republic of",
            "Syria": "Syrian Arab Republic",
            "Venezuela": "Venezuela, Bolivarian Republic of",
            "Bolivia": "Bolivia, Plurinational State of",
            "Tanzania": "Tanzania, United Republic of",
            "Vietnam": "Viet Nam",
            "Czech Republic": "Czechia",
            "Macedonia": "North Macedonia",
            "Turkey": "Türkiye",
        }
        all_country_names.extend(country_variations.keys())

        error_parts = [
            f"❌ Invalid country names detected: {len(invalid_countries)} out of {len(countries + countries_to_be_omitted)} countries",
        ]

        if countries_to_be_omitted:
            # The relevant config key is 'omitted_countries' (was previously
            # misreported as 'countries_to_be_omitted' in this message).
            error_parts.append(
                f"\n⏭️  Omitted countries (from omitted_countries config): {', '.join(countries_to_be_omitted)}"
            )

        error_parts.append("\nInvalid entries:")

        for invalid in invalid_countries:
            error_parts.append(f"  ❌ '{invalid}'")

            suggestions = get_close_matches(invalid, all_country_names, n=3, cutoff=0.6)
            if suggestions:
                quoted = ", ".join(f"'{s}'" for s in suggestions)
                error_parts.append(f"     ℹ️  Did you mean: {quoted}")

            if invalid in country_variations:
                error_parts.append(
                    f"     ℹ️  Try using: '{country_variations[invalid]}'"
                )

        error_parts.extend(
            [
                "\n✅ Valid entries:",
                *[
                    f"  ✅ '{valid}' → {country_code_map[valid]}"
                    for valid in valid_countries[:5]
                ],
                f"  ... and {len(valid_countries) - 5} more"
                if len(valid_countries) > 5
                else "",
                "\n📝 Accepted formats:",
                "  - Full name: 'Germany', 'United States'",
                "  - ISO 3166-1 alpha-2: 'DE', 'US'",
                "  - ISO 3166-1 alpha-3: 'DEU', 'USA'",
                "  - Common names: 'USA', 'UK', 'South Korea'",
                "\n⚠️  All countries must be valid before processing can begin.",
                "Please correct the invalid entries and try again.",
            ]
        )

        # filter(None, ...) drops the empty placeholder added when <= 5 valid
        error_msg = "\n".join(filter(None, error_parts))
        logger.error(error_msg)
        raise ValueError(error_msg)

    logger.info(f"✅ Successfully validated all {len(valid_countries)} countries")
    if countries_to_be_omitted:
        logger.info(
            f"ℹ️  Note: {len(countries_to_be_omitted)} countries were omitted per configuration"
        )
    logger.debug(
        f"Country codes: {', '.join(f'{c}={code}' for c, code in country_code_map.items())}"
    )

    return valid_countries, country_code_map

validate_and_standardize_df(df, valid_fueltypes=None, valid_technologies=None, valid_sets=None)

Remove metadata columns and validate data.

Removes cache-related columns (config_hash, created_at, etc.) and logs warnings for invalid fuel types, technologies, or sets.

Parameters:

Name Type Description Default
df DataFrame

Power plant data

required
valid_fueltypes list of str

Allowed fuel types

None
valid_technologies list of str

Allowed technologies

None
valid_sets list of str

Allowed set types

None

Returns:

Type Description
DataFrame

DataFrame with metadata columns removed

Source code in src/osm_powerplants/interface.py
def validate_and_standardize_df(
    df, valid_fueltypes=None, valid_technologies=None, valid_sets=None
):
    """Remove metadata columns and validate data.

    Removes cache-related columns (config_hash, created_at, etc.) and
    logs warnings for invalid fuel types, technologies, or sets.
    Invalid rows are kept — validation only warns, it never drops data.

    Parameters
    ----------
    df : pd.DataFrame
        Power plant data
    valid_fueltypes : list of str, optional
        Allowed fuel types (defaults to VALID_FUELTYPES)
    valid_technologies : list of str, optional
        Allowed technologies (defaults to VALID_TECHNOLOGIES)
    valid_sets : list of str, optional
        Allowed set types (defaults to VALID_SETS)

    Returns
    -------
    pd.DataFrame
        Copy of the input with metadata columns removed (the input
        DataFrame is not modified)
    """
    if df.empty:
        return df

    if valid_fueltypes is None:
        valid_fueltypes = VALID_FUELTYPES
    if valid_technologies is None:
        valid_technologies = VALID_TECHNOLOGIES
    if valid_sets is None:
        valid_sets = VALID_SETS

    df = df.copy()

    # Remove only cache-related metadata columns
    metadata_columns = [
        "created_at",
        "config_hash",
        "config_version",
        "processing_parameters",
        "id",  # redundant with projectID
    ]
    df = df.drop(columns=[col for col in metadata_columns if col in df.columns])

    # Validate but don't remove data. One pass per categorical column;
    # vectorized Series.isin replaces the previous row-wise apply(lambda).
    checks = [
        ("Fueltype", valid_fueltypes),
        ("Technology", valid_technologies),
        ("Set", valid_sets),
    ]
    for column, allowed in checks:
        if column in df.columns:
            invalid = ~df[column].dropna().isin(allowed)
            if invalid.any():
                logger.warning(
                    f"Found {invalid.sum()} rows with invalid {column} values"
                )

    return df