# Source code for dalia.csvv

"""
Reading and representing CSVV text files
"""
import os
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Dict

import numpy as np
import yaml


@dataclass
class Csvv:
    """Data from a CSVV file

    Plain container for the header entries and body sections of one CSVV
    file. The class itself performs no validation or conversion; the notes
    below describe the values it holds when built by ``read_csvv``.

    Attributes
    ----------
    oneline, version, dependencies, description, csvv_version
        Header entries of the same names, as loaded from the YAML header.
        The "csvv-version" entry of the file is stored as ``csvv_version``.
    variables : dict
        The "variables" header entry.
    observations : int
        The single value of the "observations" body section.
    prednames, covarnames : list of str
        Values of the "prednames" and "covarnames" body sections.
    gamma : np.ndarray
        Values of the "gamma" body section as floats, one per entry of
        ``prednames`` / ``covarnames``.
    gammavcv : np.ndarray
        Rows of the "gammavcv" body section as floats, one row per element
        of ``gamma``.
        NOTE(review): presumably the variance-covariance matrix of
        ``gamma`` -- confirm against the CSVV format description.
    residvcv : float
        The single value of the "residvcv" body section.
    """

    # Entries read from the YAML header.
    oneline: str
    version: str
    dependencies: str
    description: str
    csvv_version: str
    variables: Dict[str, str]
    # Sections read from the delimited body.
    observations: int
    prednames: List[str]
    covarnames: List[str]
    gamma: np.ndarray
    gammavcv: np.ndarray
    residvcv: float


def _parse_csvvlines(lines, sep=","):
    """List of str from CSVV file to dict

    Lines between a ``---`` line and the following ``...`` line are parsed
    as a YAML header. Lines after ``...`` form the body: a line whose
    stripped text is a body section name opens that section, and every later
    line is split on `sep` and stored as one row of the open section.

    Parameters
    ----------
    lines : sequence of str
        To parse.
    sep : str or None, optional
        Value delimiter for the body of the CSVV file. If None, delimits white
        space.

    Returns
    -------
    meta : dict
        Header entries as loaded from YAML, with "csvv-version" renamed to
        "csvv_version", plus the body sections: "observations" (int),
        "residvcv" (float), "prednames" and "covarnames" (list of str),
        "gamma" and "gammavcv" (float ``np.ndarray``).

    Raises
    ------
    IndexError
        If the ``...`` line is the last line, so no body follows the header.
    AssertionError
        If the section names found are not exactly the expected header and
        body sections, or if "prednames", "covarnames" or "gamma" do not
        hold as many elements as "gammavcv" has rows.
    """
    header_sections = [
        "oneline",
        "version",
        "dependencies",
        "description",
        "csvv-version",
        "variables",
    ]
    body_sections = [
        "observations",
        "prednames",
        "covarnames",
        "gamma",
        "gammavcv",
        "residvcv",
    ]
    # Divide into header and body.
    header_lines = []
    body = defaultdict(list)

    in_header = False
    in_body = False
    # Name of the body section currently being filled; None until a section
    # name has been seen after the header.
    section = None

    last_idx = len(lines) - 1
    for idx, line in enumerate(lines):
        stripped = line.strip()

        if stripped == "---":
            # Start-of-header marker.
            in_header = True
            in_body = False
            section = None
            continue

        if stripped == "...":
            # End-of-header marker: there must be at least one line after it
            # for a body to exist.
            if idx >= last_idx:
                raise IndexError("CSVV body has too few lines")
            in_header = False
            in_body = True
            section = None
            continue

        if in_header:
            header_lines.append(line.rstrip())
            continue

        if not in_body:
            # Anything before the first marker is ignored.
            continue

        # NOTE: a data row whose stripped text equals a section name is
        # treated as the start of that section, not as data.
        if stripped in body_sections:
            section = stripped
        elif section is not None:
            body[section].append([x.strip() for x in line.split(sep)])

    # SafeLoader restricts the header to plain YAML types.
    meta = yaml.load("\n".join(header_lines), Loader=yaml.SafeLoader)
    # Combine sections
    meta.update(body)

    # Raised explicitly (rather than with ``assert``) so the validation is
    # still performed when Python runs with -O.
    expected = set(header_sections + body_sections)
    found = set(meta)
    if found != expected:
        missing = sorted(expected - found, key=str)
        unexpected = sorted(found - expected, key=str)
        raise AssertionError(
            f"CSVV sections do not match the expected set; "
            f"missing: {missing}, unexpected: {unexpected}"
        )

    # Clean body data
    # Each parsed line is a list of values; flatten the sections that are
    # plain sequences rather than matrices.
    for key in ["prednames", "covarnames", "gamma", "residvcv", "observations"]:
        meta[key] = [item for row in meta[key] for item in row]

    # Check for correct len: one name/coefficient per row of gammavcv.
    n = len(meta["gammavcv"])
    for key in ["prednames", "covarnames", "gamma"]:
        if len(meta[key]) != n:
            raise AssertionError(f"{key} does not contain {n} elements")

    # Cast numerics from strings
    for key in ["gamma", "gammavcv", "residvcv"]:
        meta[key] = np.array(meta[key], dtype="float")
    meta["observations"] = np.array(meta["observations"], dtype="int")

    # Arrays to scalars; ``.item()`` requires exactly one value in each.
    meta["observations"] = meta["observations"].item()
    meta["residvcv"] = meta["residvcv"].item()

    # meta keys eventually become attrs, so remove "-"
    meta["csvv_version"] = meta.pop("csvv-version")
    return meta


def read_csvv(filepath_or_buffer, sep=","):
    """Read a CSVV file into a CSVV object

    Parameters
    ----------
    filepath_or_buffer : str, os.PathLike or file-like
        Path to the target file, or an already opened text buffer providing
        ``readlines()``. A path is opened and closed here; a buffer passed in
        by the caller is read but not closed.
    sep : str or None, optional
        Value delimiter for the body of the CSVV file. If None, delimits
        whitespace.

    Returns
    -------
    Csvv
    """
    # Treat ``pathlib.Path`` and other path-like objects the same as str
    # paths instead of trying to call ``readlines()`` on them.
    if isinstance(filepath_or_buffer, (str, os.PathLike)):
        with open(filepath_or_buffer, "r") as fl:
            fl_guts = fl.readlines()
    else:
        fl_guts = filepath_or_buffer.readlines()

    return Csvv(**_parse_csvvlines(fl_guts, sep=sep))