Source code for dalia.csvv

"""
Reading and representing CSVV text files
"""
import os
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Dict

import numpy as np
import yaml


@dataclass
class Csvv:
    """Container for the contents of one CSVV file.

    The fields mirror the keys of the dict built by ``_parse_csvvlines``:
    the first six come from the YAML header of the file, the remaining six
    from the delimited body sections.

    Attributes
    ----------
    oneline, version, dependencies, description : str
        Header entries of the same name.
    csvv_version : str
        The ``csvv-version`` header entry (renamed so it is a valid
        attribute name).
    variables : dict
        The ``variables`` header entry.
    observations : int
        The single value of the ``observations`` body section.
    prednames, covarnames : list of str
        Flattened values of the body sections of the same name.
    gamma : np.ndarray
        Flattened values of the ``gamma`` body section, as floats.
    gammavcv : np.ndarray
        Values of the ``gammavcv`` body section as floats, one row per line.
    residvcv : float
        The single value of the ``residvcv`` body section.
    """

    # Header fields.
    oneline: str
    version: str
    dependencies: str
    description: str
    csvv_version: str
    variables: Dict[str, str]

    # Body fields.
    observations: int
    prednames: List[str]
    covarnames: List[str]
    gamma: np.ndarray
    gammavcv: np.ndarray
    residvcv: float
def _parse_csvvlines(lines, sep=","): """List of str from CSVV file to dict Parameters ---------- lines : sequence of str To parse. sep : str or None, optional Value delimiter for the body of the CSVV file. If None, delimits white space. Returns ------- meta : dict """ # This all could be done better... header_sections = [ "oneline", "version", "dependencies", "description", "csvv-version", "variables", ] body_sections = [ "observations", "prednames", "covarnames", "gamma", "gammavcv", "residvcv", ] # Divide into header an body header_lines = [] body = defaultdict(list) inheader = False inbody = False # Grab header. Find start idx of body. n_lines = len(lines) for idx, l in enumerate(lines): if l.strip() == "---": # First line of file, indicates header incoming. inheader = True inbody = False continue if l.strip() == "..." and n_lines > idx: # First transition to body. inheader = False inbody = True continue elif l.strip() == "...": raise IndexError("CSVV body has too few lines") if inheader: header_lines.append(l.rstrip()) continue if inbody: if l.strip() in body_sections: # This is a body section. inbody = l.strip() continue elif inbody in body_sections: # This is a data section. body[inbody].append([x.strip() for x in l.split(sep)]) meta = yaml.load("\n".join(header_lines), Loader=yaml.SafeLoader) # Combine sections meta.update(body) assert set(list(meta.keys())) == set(header_sections + body_sections) # Clean body data # Flatten nested lists where needed. for k in ["prednames", "covarnames", "gamma", "residvcv", "observations"]: meta[k] = [item for sublist in meta[k] for item in sublist] # Check for correct len. 
n = len(meta["gammavcv"]) for k in ["prednames", "covarnames", "gamma"]: assert len(meta[k]) == n, f"{k} does not contain {n} elements" # Cast numerics from strings for k in ["gamma", "gammavcv", "residvcv"]: meta[k] = np.array(meta[k], dtype="float") meta["observations"] = np.array(meta["observations"], dtype="int") # Arrays to scalars meta["observations"] = meta["observations"].item() meta["residvcv"] = meta["residvcv"].item() # meta keys eventually become attrs, so remove "-" meta["csvv_version"] = meta.pop("csvv-version") return meta
def read_csvv(filepath_or_buffer, sep=","):
    """Read a CSVV file into a Csvv object.

    Parameters
    ----------
    filepath_or_buffer : str, os.PathLike or file-like
        Path to the target file, or an opened buffer with a ``readlines``
        method.
    sep : str or None, optional
        Value delimiter for the body of the CSVV file. If None, delimits
        whitespace.

    Returns
    -------
    Csvv
    """
    # Strings and path objects (e.g. pathlib.Path) are opened here; anything
    # else is treated as an already-open buffer.
    if isinstance(filepath_or_buffer, (str, os.PathLike)):
        with open(filepath_or_buffer, "r") as fl:
            fl_guts = fl.readlines()
    else:
        fl_guts = filepath_or_buffer.readlines()

    return Csvv(**_parse_csvvlines(fl_guts, sep=sep))