Skip to content

Interface API

osm_powerplants.interface

OpenStreetMap power plant data interface.

This module provides the main interface for extracting and processing power plant data from OpenStreetMap. It handles country validation, multi-level caching, and data processing.

Main functions:

- process_units: Simplified entry point for most use cases
- process_countries: Lower-level function with more options
- validate_countries: Validate country names with fuzzy matching

VALID_FUELTYPES = ['Nuclear', 'Solid Biomass', 'Biogas', 'Wind', 'Hydro', 'Solar', 'Oil', 'Natural Gas', 'Hard Coal', 'Lignite', 'Geothermal', 'Waste', 'Other'] module-attribute

VALID_TECHNOLOGIES = ['Steam Turbine', 'OCGT', 'CCGT', 'Run-Of-River', 'Reservoir', 'Pumped Storage', 'Offshore', 'Onshore', 'PV', 'CSP', 'Combustion Engine', 'Marine'] module-attribute

VALID_SETS = ['PP', 'CHP', 'Store'] module-attribute

process_countries(countries, csv_cache_path, cache_dir, update, osm_config, raw=False)

Process power plant data for specified countries.

Parameters:

Name Type Description Default
countries list of str

Country names or ISO codes

required
csv_cache_path str

Path to CSV cache file

required
cache_dir str

Cache directory path

required
update bool

Force cache update if True

required
osm_config dict

Configuration dictionary

required
raw bool

If True, return all columns including metadata

False

Returns:

Type Description
DataFrame

Power plant data

Source code in src/osm_powerplants/interface.py
def process_countries(
    countries, csv_cache_path, cache_dir, update, osm_config, raw=False
):
    """Process power plant data for specified countries.

    Parameters
    ----------
    countries : list of str
        Country names or ISO codes
    csv_cache_path : str
        Path to CSV cache file
    cache_dir : str
        Cache directory path
    update : bool
        Force cache update if True
    osm_config : dict
        Configuration dictionary
    raw : bool, default False
        If True, return all columns including metadata

    Returns
    -------
    pd.DataFrame
        Power plant data; an empty DataFrame if no country yielded data

    Raises
    ------
    ValueError
        If country validation fails (re-raised with additional context).
    """
    logger.info(f"Starting country validation for {len(countries)} countries...")

    api_url = osm_config.get("overpass_api", {}).get("api_url")
    current_config_hash = Unit._generate_config_hash(osm_config)
    force_refresh = osm_config.get("force_refresh", False)
    omitted_countries = osm_config.get("omitted_countries", [])
    try:
        valid_countries, country_code_map = validate_countries(
            countries, omitted_countries
        )
    except ValueError as e:
        raise ValueError(
            f"Country validation failed. Cannot proceed with OSM data processing.\n{str(e)}"
        ) from e

    logger.info(
        f"Country validation successful! Processing OSM data for {len(valid_countries)} countries: "
        f"{', '.join(valid_countries[:5])}"
        f"{f' and {len(valid_countries) - 5} more' if len(valid_countries) > 5 else ''}"
    )

    # Collect per-country frames and concatenate once at the end;
    # calling pd.concat inside the loop is quadratic in total row count.
    frames = []

    # Create single client for all countries
    client_params = get_client_params(osm_config, api_url, cache_dir)

    with OverpassAPIClient(**client_params) as client:
        for i, country in enumerate(valid_countries, 1):
            logger.info(
                f"Processing country {i}/{len(valid_countries)}: {country} ({country_code_map[country]})"
            )

            country_data = process_single_country(
                country,
                csv_cache_path,
                current_config_hash,
                update,
                force_refresh,
                osm_config,
                client,
            )

            if country_data is not None and not country_data.empty:
                if not raw:
                    # Strip cache metadata and log any out-of-vocabulary values
                    country_data = validate_and_standardize_df(
                        country_data,
                        VALID_FUELTYPES,
                        VALID_TECHNOLOGIES,
                        VALID_SETS,
                    )
                frames.append(country_data)

    logger.info(f"✅ Successfully processed all {len(valid_countries)} countries")
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

process_units(countries, config, cache_dir, output_path=None, raw=True)

Process power plant data for specified countries.

Parameters:

Name Type Description Default
countries list[str]

Country names or ISO codes (e.g., ['Germany', 'FR', 'ESP'])

required
config dict

Configuration from get_config()

required
cache_dir str

Cache directory from get_cache_dir()

required
output_path str

Save CSV to this path if provided

None
raw bool

If True, return all columns. If False, remove metadata columns (config_hash, created_at, processing_parameters, id).

True

Returns:

Type Description
DataFrame

Power plant data with columns: projectID, Name, Country, lat, lon, Fueltype, Technology, Set, Capacity, DateIn, type, capacity_source. When raw=True, also includes metadata columns.

Examples:

>>> from osm_powerplants import process_units, get_config, get_cache_dir
>>> config = get_config()
>>> df = process_units(
...     countries=['Malta', 'Luxembourg'],
...     config=config,
...     cache_dir=str(get_cache_dir(config)),
... )
Source code in src/osm_powerplants/interface.py
def process_units(
    countries: list[str],
    config: dict,
    cache_dir: str,
    output_path: str | None = None,
    raw: bool = True,
) -> pd.DataFrame:
    """Process power plant data for specified countries.

    Convenience wrapper around :func:`process_countries` that derives the
    CSV cache location and update flag from ``config``.

    Parameters
    ----------
    countries : list[str]
        Country names or ISO codes (e.g., ['Germany', 'FR', 'ESP'])
    config : dict
        Configuration from get_config()
    cache_dir : str
        Cache directory from get_cache_dir()
    output_path : str, optional
        Save CSV to this path if provided
    raw : bool, default True
        If True, return all columns. If False, remove metadata columns
        (config_hash, created_at, processing_parameters, id).

    Returns
    -------
    pd.DataFrame
        Power plant data with columns: projectID, Name, Country, lat, lon,
        Fueltype, Technology, Set, Capacity, DateIn, type, capacity_source.
        When raw=True, also includes metadata columns.

    Examples
    --------
    >>> from osm_powerplants import process_units, get_config, get_cache_dir
    >>> config = get_config()
    >>> df = process_units(
    ...     countries=['Malta', 'Luxembourg'],
    ...     config=config,
    ...     cache_dir=str(get_cache_dir(config)),
    ... )
    """
    import os

    result = process_countries(
        countries=countries,
        csv_cache_path=os.path.join(cache_dir, "osm_data.csv"),
        cache_dir=cache_dir,
        update=config.get("force_refresh", False),
        osm_config=config,
        raw=raw,
    )

    # Only persist when a path was given and there is something to write.
    if output_path and not result.empty:
        result.to_csv(output_path, index=False)

    return result

validate_countries(countries, omitted_countries=[])

Validate country names and provide helpful suggestions for invalid entries.

Uses pycountry to validate country names, supporting full names, ISO codes, and common variations. Provides fuzzy matching suggestions for typos or incorrect names. Checks against omitted countries list from configuration.

Parameters:

Name Type Description Default
countries list of str

Country names to validate. Accepts:

- Full names: 'Germany', 'United States'
- ISO 3166-1 alpha-2: 'DE', 'US'
- ISO 3166-1 alpha-3: 'DEU', 'USA'
- Common variations: 'USA', 'UK', 'South Korea'

required
omitted_countries list of str

List of countries to omit from processing

[]

Returns:

Name Type Description
valid_countries list of str

List of validated country names as provided (minus omitted ones)

country_code_map dict

Mapping of country names to ISO alpha-2 codes

Raises:

Type Description
ValueError

If any country names are invalid. Error message includes suggestions for similar valid names and shows which entries were valid vs invalid.

Examples:

>>> valid, codes = validate_countries(['Germany', 'France'])
>>> print(codes)
{'Germany': 'DE', 'France': 'FR'}
>>> validate_countries(['Germny'])  # Typo
ValueError: ❌ Invalid country names detected...
     ℹ️  Did you mean: 'Germany', 'Armenia'
Source code in src/osm_powerplants/interface.py
def validate_countries(
    countries: list[str], omitted_countries: list[str] | None = None
) -> tuple[list[str], dict[str, str]]:
    """Validate country names and provide helpful suggestions for invalid entries.

    Uses pycountry to validate country names, supporting full names, ISO codes,
    and common variations. Provides fuzzy matching suggestions for typos or
    incorrect names. Checks against omitted countries list from configuration.

    Parameters
    ----------
    countries : list of str
        Country names to validate. Accepts:
        - Full names: 'Germany', 'United States'
        - ISO 3166-1 alpha-2: 'DE', 'US'
        - ISO 3166-1 alpha-3: 'DEU', 'USA'
        - Common variations: 'USA', 'UK', 'South Korea'

    omitted_countries : list of str, optional
        List of countries to omit from processing. ``None`` (the default)
        means no omissions.

    Returns
    -------
    valid_countries : list of str
        List of validated country names as provided (minus omitted ones)
    country_code_map : dict
        Mapping of country names to ISO alpha-2 codes

    Raises
    ------
    ValueError
        If any country names are invalid. Error message includes
        suggestions for similar valid names and shows which entries
        were valid vs invalid.

    Examples
    --------
    >>> valid, codes = validate_countries(['Germany', 'France'])
    >>> print(codes)
    {'Germany': 'DE', 'France': 'FR'}

    >>> validate_countries(['Germny'])  # Typo
    ValueError: ❌ Invalid country names detected...
         ℹ️  Did you mean: 'Germany', 'Armenia'
    """
    import pycountry

    # None sentinel instead of a mutable [] default: a shared default list
    # would persist across calls if it were ever mutated.
    if omitted_countries is None:
        omitted_countries = []

    countries_to_be_omitted = []
    if omitted_countries:
        # Filter out omitted countries
        filtered_countries = []
        for country in countries:
            if country in omitted_countries:
                logger.info(
                    f"Omitting country '{country}' as specified in configuration (omitted_countries)"
                )
                countries_to_be_omitted.append(country)
            else:
                filtered_countries.append(country)
        countries = filtered_countries

    # If no countries left after omission, return empty results
    if not countries:
        logger.warning(
            "No countries left to process after applying omitted_countries filter"
        )
        return [], {}

    valid_countries = []
    invalid_countries = []
    country_code_map = {}

    for country in countries:
        country_code = get_country_code(country)
        if country_code is not None:
            valid_countries.append(country)
            country_code_map[country] = country_code
        else:
            invalid_countries.append(country)

    if invalid_countries:
        # Candidate pool for fuzzy matching: official names plus both ISO codes.
        all_country_names = [c.name for c in pycountry.countries]  # type: ignore
        all_country_names.extend([c.alpha_2 for c in pycountry.countries])  # type: ignore
        all_country_names.extend([c.alpha_3 for c in pycountry.countries])  # type: ignore

        # Common informal names mapped to their pycountry equivalents.
        country_variations = {
            "USA": "United States",
            "UK": "United Kingdom",
            "South Korea": "Korea, Republic of",
            "North Korea": "Korea, Democratic People's Republic of",
            "Russia": "Russian Federation",
            "Iran": "Iran, Islamic Republic of",
            "Syria": "Syrian Arab Republic",
            "Venezuela": "Venezuela, Bolivarian Republic of",
            "Bolivia": "Bolivia, Plurinational State of",
            "Tanzania": "Tanzania, United Republic of",
            "Vietnam": "Viet Nam",
            "Czech Republic": "Czechia",
            "Macedonia": "North Macedonia",
            "Turkey": "Türkiye",
        }
        all_country_names.extend(country_variations.keys())

        error_parts = [
            f"❌ Invalid country names detected: {len(invalid_countries)} out of {len(countries + countries_to_be_omitted)} countries",
        ]

        if countries_to_be_omitted:
            # The relevant config key is 'omitted_countries' (was previously
            # misreported as 'countries_to_be_omitted' in this message).
            error_parts.append(
                f"\n⏭️  Omitted countries (from omitted_countries config): {', '.join(countries_to_be_omitted)}"
            )

        error_parts.append("\nInvalid entries:")

        for invalid in invalid_countries:
            error_parts.append(f"  ❌ '{invalid}'")

            suggestions = get_close_matches(invalid, all_country_names, n=3, cutoff=0.6)
            if suggestions:
                quoted = ", ".join(f"'{s}'" for s in suggestions)
                error_parts.append(f"     ℹ️  Did you mean: {quoted}")

            if invalid in country_variations:
                error_parts.append(
                    f"     ℹ️  Try using: '{country_variations[invalid]}'"
                )

        error_parts.extend(
            [
                "\n✅ Valid entries:",
                *[
                    f"  ✅ '{valid}' → {country_code_map[valid]}"
                    for valid in valid_countries[:5]
                ],
                f"  ... and {len(valid_countries) - 5} more"
                if len(valid_countries) > 5
                else "",
                "\n📝 Accepted formats:",
                "  - Full name: 'Germany', 'United States'",
                "  - ISO 3166-1 alpha-2: 'DE', 'US'",
                "  - ISO 3166-1 alpha-3: 'DEU', 'USA'",
                "  - Common names: 'USA', 'UK', 'South Korea'",
                "\n⚠️  All countries must be valid before processing can begin.",
                "Please correct the invalid entries and try again.",
            ]
        )

        # filter(None, ...) drops the empty placeholder added when <= 5 valid
        error_msg = "\n".join(filter(None, error_parts))
        logger.error(error_msg)
        raise ValueError(error_msg)

    logger.info(f"✅ Successfully validated all {len(valid_countries)} countries")
    if countries_to_be_omitted:
        logger.info(
            f"ℹ️  Note: {len(countries_to_be_omitted)} countries were omitted per configuration"
        )
    logger.debug(
        f"Country codes: {', '.join(f'{c}={code}' for c, code in country_code_map.items())}"
    )

    return valid_countries, country_code_map

validate_and_standardize_df(df, valid_fueltypes=None, valid_technologies=None, valid_sets=None)

Remove metadata columns and validate data.

Removes cache-related columns (config_hash, created_at, etc.) and logs warnings for invalid fuel types, technologies, or sets.

Parameters:

Name Type Description Default
df DataFrame

Power plant data

required
valid_fueltypes list of str

Allowed fuel types

None
valid_technologies list of str

Allowed technologies

None
valid_sets list of str

Allowed set types

None

Returns:

Type Description
DataFrame

DataFrame with metadata columns removed

Source code in src/osm_powerplants/interface.py
def validate_and_standardize_df(
    df, valid_fueltypes=None, valid_technologies=None, valid_sets=None
):
    """Remove metadata columns and validate data.

    Removes cache-related columns (config_hash, created_at, etc.) and
    logs warnings for invalid fuel types, technologies, or sets.
    Invalid rows are kept — validation only warns, it never drops data.

    Parameters
    ----------
    df : pd.DataFrame
        Power plant data
    valid_fueltypes : list of str, optional
        Allowed fuel types (defaults to VALID_FUELTYPES)
    valid_technologies : list of str, optional
        Allowed technologies (defaults to VALID_TECHNOLOGIES)
    valid_sets : list of str, optional
        Allowed set types (defaults to VALID_SETS)

    Returns
    -------
    pd.DataFrame
        Copy of the input with metadata columns removed (the input
        DataFrame is not modified)
    """
    if df.empty:
        return df

    if valid_fueltypes is None:
        valid_fueltypes = VALID_FUELTYPES
    if valid_technologies is None:
        valid_technologies = VALID_TECHNOLOGIES
    if valid_sets is None:
        valid_sets = VALID_SETS

    df = df.copy()

    # Remove only cache-related metadata columns
    metadata_columns = [
        "created_at",
        "config_hash",
        "config_version",
        "processing_parameters",
        "id",  # redundant with projectID
    ]
    df = df.drop(columns=[col for col in metadata_columns if col in df.columns])

    # Validate but don't remove data. One pass per categorical column;
    # vectorized Series.isin replaces the previous row-wise apply(lambda).
    checks = [
        ("Fueltype", valid_fueltypes),
        ("Technology", valid_technologies),
        ("Set", valid_sets),
    ]
    for column, allowed in checks:
        if column in df.columns:
            invalid = ~df[column].dropna().isin(allowed)
            if invalid.any():
                logger.warning(
                    f"Found {invalid.sum()} rows with invalid {column} values"
                )

    return df