Source code for thermo.gpumd.data

import numpy as np
import pandas as pd
import os
import copy
import multiprocessing as mp
from functools import partial
from collections import deque
from .common import __get_path, __get_direction, __check_list, __check_range


__author__ = "Alexander Gabourie"
__email__ = "gabourie@stanford.edu"

#########################################
# Helper Functions
#########################################


def __process_sample(nbins, i):
    """
    A helper function for the multiprocessing of kappamode.out files

    Args:
        nbins (int):
            Number of bins used in the GPUMD simulation

        i (int):
            The current sample from a run to analyze

    Returns:
        np.ndarray: A 2D array of each bin and output for a sample


    """
    out = list()
    for j in range(nbins):
        out += [float(x) for x in malines[j + i * nbins].split()]
    return np.array(out).reshape((nbins, 5))


def tail(f, nlines, BLOCK_SIZE=32768):
    """
    Reads the last nlines of a file.

    Args:
        f (filehandle):
            File handle of the file to be read

        nlines (int):
            Number of lines to be read from the end of the file

        BLOCK_SIZE (int):
            Size of block (in bytes) to be read per read operation.
            Performance depends on this parameter and the file size.

    Returns:
        list: List of the final nlines of the file, in order

    Additional Information:
        Since GPUMD output files are mostly append-only, this becomes useful when a simulation
        prematurely ends (i.e. a cluster preempts the run, but the simulation restarts elsewhere).
        In this case, it is not necessary to clean the directory before re-running. File outputs
        will be too long (so there is still a storage concern), but the proper data can be
        extracted from the end of the file. This may also be useful if you only want to grab data
        from the final m runs of the simulation.
    """
    # BLOCK_SIZE is in bytes (must decode to string)
    f.seek(0, 2)
    bytes_remaining = f.tell()
    idx = -BLOCK_SIZE
    blocks = list()
    # Make no assumptions about line length
    lines_left = nlines
    eof = False
    first = True
    num_lines = 0

    # BLOCK_SIZE is smaller than file
    if BLOCK_SIZE <= bytes_remaining:
        while lines_left > 0 and not eof:
            if bytes_remaining > BLOCK_SIZE:
                f.seek(idx, 2)
                blocks.append(f.read(BLOCK_SIZE))
            else:  # if reached end of file
                f.seek(0, 0)
                blocks.append(f.read(bytes_remaining))
                eof = True

            idx -= BLOCK_SIZE
            bytes_remaining -= BLOCK_SIZE
            num_lines = blocks[-1].count(b'\n')
            if first:
                lines_left -= num_lines - 1
                first = False
            else:
                lines_left -= num_lines

        # since whitespace removed from eof, must compare to 1 here
        if eof and lines_left > 1:
            raise ValueError("More lines requested than exist.")

        # Corrects for reading too many lines with large buffer
        if bytes_remaining > 0:
            skip = 1 + abs(lines_left)
            blocks[-1] = blocks[-1].split(b'\n', skip)[skip]
        text = b''.join(reversed(blocks)).strip()
    else:  # BLOCK_SIZE is bigger than file
        f.seek(0, 0)
        block = f.read()
        num_lines = block.count(b'\n')
        if num_lines < nlines:
            raise ValueError("More lines requested than exist.")
        skip = num_lines - nlines
        text = block.split(b'\n', skip)[skip].strip()
    return text.split(b'\n')

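# Illustrative usage sketch for tail(): grab only the most recent lines of an
# append-only GPUMD output file. The filename 'thermo.out' is just an example;
# the file must be opened in binary mode because tail() works on bytes.
#
#     with open('thermo.out', 'rb') as f:
#         last_lines = tail(f, 100)                 # list of bytes, oldest first
#     last_values = [float(x) for x in last_lines[-1].split()]
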
def __modal_analysis_read(nbins, nsamples, datapath, ndiv, multiprocessing, ncore, block_size):
    global malines
    # Get full set of results
    datalines = nbins * nsamples
    with open(datapath, 'rb') as f:
        if multiprocessing:
            malines = tail(f, datalines, BLOCK_SIZE=block_size)
        else:
            malines = deque(tail(f, datalines, BLOCK_SIZE=block_size))

    if multiprocessing:
        # TODO Improve memory efficiency of multiprocessing
        if not ncore:
            ncore = mp.cpu_count()
        func = partial(__process_sample, nbins)
        pool = mp.Pool(ncore)
        data = np.array(pool.map(func, range(nsamples)), dtype='float32').transpose((1, 0, 2))
        pool.close()
    else:  # Faster if single thread
        data = np.zeros((nbins, nsamples, 5), dtype='float32')
        for j in range(nsamples):
            for i in range(nbins):
                measurements = malines.popleft().split()
                data[i, j, 0] = float(measurements[0])
                data[i, j, 1] = float(measurements[1])
                data[i, j, 2] = float(measurements[2])
                data[i, j, 3] = float(measurements[3])
                data[i, j, 4] = float(measurements[4])

    del malines

    if ndiv:
        nbins = int(np.ceil(data.shape[0] / ndiv))  # overwrite nbins
        npad = nbins * ndiv - data.shape[0]
        data = np.pad(data, [(0, npad), (0, 0), (0, 0)])
        data = np.sum(data.reshape((-1, ndiv, data.shape[1], data.shape[2])), axis=1)

    return data


def __basic_reader(points, data, labels):
    start = 0
    out = dict()
    for i, npoints in enumerate(points):
        end = start + npoints
        run = dict()
        for j, key in enumerate(labels):
            run[key] = data[j][start:end].to_numpy(dtype='float')
        start = end
        out['run{}'.format(i)] = run
    return out


def __basic_frame_loader(n, directory, filename):
    path = __get_path(directory, filename)
    data = pd.read_csv(path, delim_whitespace=True, header=None).to_numpy(dtype='float')
    if not (data.shape[0] / n).is_integer():
        raise ValueError("An integer number of frames cannot be created. Please check n.")
    return data.reshape(n, 3, -1)


#########################################
# Data-loading Related
#########################################

def load_omega2(directory=None, filename='omega2.out'):
    """
    Loads data from omega2.out GPUMD output file.

    Args:
        directory (str):
            Directory to load omega2 file from

        filename (str):
            Name of omega2 data file

    Returns:
        Numpy array of shape (N_kpoints, 3*N_basis) in units of THz. N_kpoints is the number of
        k points in kpoint.in and N_basis is the number of basis atoms defined in basis.in
    """
    path = __get_path(directory, filename)
    data = pd.read_csv(path, delim_whitespace=True, header=None).to_numpy(dtype='float')
    data = np.sqrt(data) / (2 * np.pi)
    return data

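# Illustrative usage sketch for load_omega2(). The directory name 'phonon_run'
# is hypothetical; the returned array has one row per k point in kpoint.in and
# one column per mode (3*N_basis), already converted to THz.
#
#     nu = load_omega2(directory='phonon_run')
#     print(nu.shape, nu.max())                     # (N_kpoints, 3*N_basis), THz
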
def load_force(n, directory=None, filename='force.out'):
    """
    Loads data from force.out GPUMD output file. Currently supports loading a single run.

    Args:
        n (int):
            Number of atoms force is output for

        directory (str):
            Directory to load force file from

        filename (str):
            Name of force data file

    Returns:
        Numpy array of shape (n,3,-1) containing all forces (eV/A) from filename
    """
    return __basic_frame_loader(n, directory, filename)

def load_velocity(n, directory=None, filename='velocity.out'):
    """
    Loads data from velocity.out GPUMD output file. Currently supports loading a single run.

    Args:
        n (int):
            Number of atoms velocity is output for

        directory (str):
            Directory to load velocity file from

        filename (str):
            Name of velocity data file

    Returns:
        Numpy array of shape (n,3,-1) containing all velocities (A/ps) from filename
    """
    return __basic_frame_loader(n, directory, filename)

def load_compute(quantities=None, directory=None, filename='compute.out'):
    """
    Loads data from compute.out GPUMD output file. Currently supports loading a single run.

    Args:
        quantities (str or list(str)):
            Quantities to extract from compute.out. Accepted quantities are:
            ['T', 'U', 'F', 'W', 'jp', 'jk']. Other quantities will be ignored.
            T=temperature, U=potential, F=force, W=virial,
            jp=heat current (potential), jk=heat current (kinetic)

        directory (str):
            Directory to load compute file from

        filename (str):
            File to load compute data from

    Returns:
        Dictionary containing the data from compute.out

    .. csv-table:: Output dictionary
       :stub-columns: 1

       **key**,T,U,F,W,jp,jk,Ein,Eout
       **units**,K,eV,|c1|,eV,|c2|,|c2|,eV,eV

    .. |c1| replace:: eVA\ :sup:`-1`
    .. |c2| replace:: eV\ :sup:`3/2` amu\ :sup:`-1/2`
    """
    # TODO Add input checking
    if not quantities:
        return None
    compute_path = __get_path(directory, filename)
    data = pd.read_csv(compute_path, delim_whitespace=True, header=None)
    num_col = len(data.columns)
    q_count = {'T': 1, 'U': 1, 'F': 3, 'W': 3, 'jp': 3, 'jk': 3}

    out = dict()
    count = 0
    for value in quantities:
        count += q_count[value]

    m = int(num_col / count)
    if 'T' in quantities:
        m = int((num_col - 2) / count)
        out['Ein'] = np.array(data.iloc[:, -2])
        out['Eout'] = np.array(data.iloc[:, -1])
    out['m'] = m

    start = 0
    for quantity in q_count.keys():
        if quantity in quantities:
            end = start + q_count[quantity] * m
            out[quantity] = data.iloc[:, start:end].to_numpy(dtype='float')
            start = end
    return out

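# Illustrative usage sketch for load_compute(). The quantities list follows the
# docstring above; the per-group column count 'm' is returned in the dictionary,
# and requesting 'T' also pulls in the trailing 'Ein'/'Eout' columns.
#
#     comp = load_compute(quantities=['T', 'jp'])
#     T_groups = comp['T']                          # shape: (num_outputs, m)
#     jp_groups = comp['jp']                        # shape: (num_outputs, 3*m)
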
def load_thermo(directory=None, filename='thermo.out'):
    """
    Loads data from thermo.out GPUMD output file.

    Args:
        directory (str):
            Directory to load thermal data file from

        filename (str):
            Name of thermal data file

    Returns:
        'output' dictionary containing the data from thermo.out

    .. csv-table:: Output dictionary
       :stub-columns: 1

       **key**,T,K,U,Px,Py,Pz,Lx,Ly,Lz,ax,ay,az,bx,by,bz,cx,cy,cz
       **units**,K,eV,eV,GPa,GPa,GPa,A,A,A,A,A,A,A,A,A,A,A,A
    """
    thermo_path = __get_path(directory, filename)
    data = pd.read_csv(thermo_path, delim_whitespace=True, header=None)
    labels = ['T', 'K', 'U', 'Px', 'Py', 'Pz']
    # Orthogonal
    if data.shape[1] == 9:
        labels += ['Lx', 'Ly', 'Lz']
    elif data.shape[1] == 15:
        labels += ['ax', 'ay', 'az', 'bx', 'by', 'bz', 'cx', 'cy', 'cz']

    out = dict()
    for i in range(data.shape[1]):
        out[labels[i]] = data[i].to_numpy(dtype='float')
    return out

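# Illustrative usage sketch for load_thermo(). The directory name 'md_run' is
# hypothetical; keys follow the table above (9 columns for orthogonal boxes,
# 15 for triclinic boxes).
#
#     thermo = load_thermo(directory='md_run')
#     print(thermo['T'].mean(), thermo['Px'].std())
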
def load_heatmode(nbins, nsamples, directory=None, inputfile='heatmode.out', directions='xyz',
                  outputfile='heatmode.npy', ndiv=None, save=False, multiprocessing=False,
                  ncore=None, block_size=65536, return_data=True):
    """
    Loads data from heatmode.out GPUMD file. Option to save as a binary file for fast re-loading later.

    WARNING: If using multiprocessing, memory usage may be significantly larger than the file size.

    Args:
        nbins (int):
            Number of bins used during the GPUMD simulation

        nsamples (int):
            Number of times heat flux was sampled with GKMA during the GPUMD simulation

        directory (str):
            Name of directory storing the input file to read

        inputfile (str):
            Modal heat flux file output by GPUMD

        directions (str):
            Directions to gather data from. Any order of 'xyz' is accepted. Excluding directions is
            also allowed (i.e. 'xz' is accepted)

        outputfile (str):
            File name to save read data to. Output file is a binary dictionary. Loading from a binary
            file is much faster than re-reading data files and saving is recommended

        ndiv (int):
            Integer used to shrink the number of bins output. If you originally have 10 bins but want 5,
            ndiv=2. nbins/ndiv need not be an integer

        save (bool):
            Toggle saving data to a binary dictionary. Loading from the save file is much faster and
            recommended

        multiprocessing (bool):
            Toggle using multi-core processing for conversion of the text file

        ncore (int):
            Number of cores to use for multiprocessing. Ignored if multiprocessing is False

        block_size (int):
            Size of block (in bytes) to be read per read operation. File reading performance depends on
            this parameter and the file size

        return_data (bool):
            Toggle returning the loaded modal heat flux data. If this is False, the user should ensure
            that save is True

    Returns:
        dict: Dictionary with all modal heat fluxes requested

    .. csv-table:: Output dictionary
       :stub-columns: 1

       **key**,nbins,nsamples,jmxi,jmxo,jmyi,jmyo,jmz
       **units**,N/A,N/A,|jm1|,|jm1|,|jm1|,|jm1|,|jm1|

    .. |jm1| replace:: eV\ :sup:`3/2` amu\ :sup:`-1/2` *x*\ :sup:`-1`

    Here *x* is the size of the bins in THz. For example, if there are 4 bins per THz, *x* = 0.25 THz.
    """
    jm_path = __get_path(directory, inputfile)
    out_path = __get_path(directory, outputfile)
    data = __modal_analysis_read(nbins, nsamples, jm_path, ndiv, multiprocessing, ncore, block_size)

    out = dict()
    directions = __get_direction(directions)
    if 'x' in directions:
        out['jmxi'] = data[:, :, 0]
        out['jmxo'] = data[:, :, 1]
    if 'y' in directions:
        out['jmyi'] = data[:, :, 2]
        out['jmyo'] = data[:, :, 3]
    if 'z' in directions:
        out['jmz'] = data[:, :, 4]

    out['nbins'] = nbins
    out['nsamples'] = nsamples

    if save:
        np.save(out_path, out)

    if return_data:
        return out
    return

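# Illustrative usage sketch for load_heatmode(). The bin/sample counts and the
# directory are hypothetical and must match the GKMA settings of the GPUMD run;
# save=True writes 'heatmode.npy' for fast re-loading with load_saved_heatmode().
#
#     gkma = load_heatmode(nbins=100, nsamples=1000, directory='gkma_run',
#                          multiprocessing=True, save=True)
#     jmxi = gkma['jmxi']                           # shape: (nbins, nsamples)
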
def load_kappamode(nbins, nsamples, directory=None, inputfile='kappamode.out', directions='xyz',
                   outputfile='kappamode.npy', ndiv=None, save=False, multiprocessing=False,
                   ncore=None, block_size=65536, return_data=True):
    """
    Loads data from kappamode.out GPUMD file. Option to save as a binary file for fast re-loading later.

    WARNING: If using multiprocessing, memory usage may be significantly larger than the file size.

    Args:
        nbins (int):
            Number of bins used during the GPUMD simulation

        nsamples (int):
            Number of times thermal conductivity was sampled with HNEMA during the GPUMD simulation

        directory (str):
            Name of directory storing the input file to read

        inputfile (str):
            Modal thermal conductivity file output by GPUMD

        directions (str):
            Directions to gather data from. Any order of 'xyz' is accepted. Excluding directions is
            also allowed (i.e. 'xz' is accepted)

        outputfile (str):
            File name to save read data to. Output file is a binary dictionary. Loading from a binary
            file is much faster than re-reading data files and saving is recommended

        ndiv (int):
            Integer used to shrink the number of bins output. If you originally have 10 bins but want 5,
            ndiv=2. nbins/ndiv need not be an integer

        save (bool):
            Toggle saving data to a binary dictionary. Loading from the save file is much faster and
            recommended

        multiprocessing (bool):
            Toggle using multi-core processing for conversion of the text file

        ncore (int):
            Number of cores to use for multiprocessing. Ignored if multiprocessing is False

        block_size (int):
            Size of block (in bytes) to be read per read operation. File reading performance depends on
            this parameter and the file size

        return_data (bool):
            Toggle returning the loaded modal thermal conductivity data. If this is False, the user
            should ensure that save is True

    Returns:
        dict: Dictionary with all modal thermal conductivities requested

    .. csv-table:: Output dictionary
       :stub-columns: 1

       **key**,nbins,nsamples,kmxi,kmxo,kmyi,kmyo,kmz
       **units**,N/A,N/A,|hn1|,|hn1|,|hn1|,|hn1|,|hn1|

    .. |hn1| replace:: Wm\ :sup:`-1` K\ :sup:`-1` *x*\ :sup:`-1`

    Here *x* is the size of the bins in THz. For example, if there are 4 bins per THz, *x* = 0.25 THz.
    """
    km_path = __get_path(directory, inputfile)
    out_path = __get_path(directory, outputfile)
    data = __modal_analysis_read(nbins, nsamples, km_path, ndiv, multiprocessing, ncore, block_size)

    out = dict()
    directions = __get_direction(directions)
    if 'x' in directions:
        out['kmxi'] = data[:, :, 0]
        out['kmxo'] = data[:, :, 1]
    if 'y' in directions:
        out['kmyi'] = data[:, :, 2]
        out['kmyo'] = data[:, :, 3]
    if 'z' in directions:
        out['kmz'] = data[:, :, 4]

    out['nbins'] = nbins
    out['nsamples'] = nsamples

    if save:
        np.save(out_path, out)

    if return_data:
        return out
    return

def load_saved_kappamode(filename='kappamode.npy', directory=None):
    """
    Loads data saved by the 'load_kappamode' function and returns the original dictionary.

    Args:
        filename (str):
            Name of the file to load

        directory (str):
            Directory the data file is located in

    Returns:
        dict: Dictionary with all modal thermal conductivities previously requested
    """
    path = __get_path(directory, filename)
    return np.load(path, allow_pickle=True).item()

def load_saved_heatmode(filename='heatmode.npy', directory=None):
    """
    Loads data saved by the 'load_heatmode' or 'get_gkma_kappa' function and returns the original
    dictionary.

    Args:
        filename (str):
            Name of the file to load

        directory (str):
            Directory the data file is located in

    Returns:
        dict: Dictionary with all modal heat fluxes previously requested
    """
    path = __get_path(directory, filename)
    return np.load(path, allow_pickle=True).item()

def load_sdc(Nc, directory=None, filename='sdc.out'):
    """
    Loads data from sdc.out GPUMD output file.

    Args:
        Nc (int or list(int)):
            Number of time correlation points the VAC/SDC is computed for

        directory (str):
            Directory to load 'sdc.out' file from (dir. of simulation)

        filename (str):
            File to load SDC from

    Returns:
        dict(dict): Dictionary with SDC/VAC data. The outermost dictionary stores each individual run

    .. csv-table:: Output dictionary
       :stub-columns: 1

       **key**,t,VACx,VACy,VACz,SDCx,SDCy,SDCz
       **units**,ps,|sd1|,|sd1|,|sd1|,|sd2|,|sd2|,|sd2|

    .. |sd1| replace:: A\ :sup:`2` ps\ :sup:`-2`
    .. |sd2| replace:: A\ :sup:`2` ps\ :sup:`-1`
    """
    Nc = __check_list(Nc, varname='Nc', dtype=int)
    sdc_path = __get_path(directory, filename)
    data = pd.read_csv(sdc_path, delim_whitespace=True, header=None)
    __check_range(Nc, data.shape[0])
    labels = ['t', 'VACx', 'VACy', 'VACz', 'SDCx', 'SDCy', 'SDCz']
    return __basic_reader(Nc, data, labels)

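# Illustrative usage sketch for load_sdc(). Two runs with 250 correlation points
# each are assumed here; each 'runN' entry holds the keys listed in the table.
#
#     sdc = load_sdc(Nc=[250, 250], directory='diffusion_run')
#     vac_x = sdc['run0']['VACx']                   # A^2 ps^-2
#     sdc_x = sdc['run0']['SDCx']                   # A^2 ps^-1
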
def load_vac(Nc, directory=None, filename='mvac.out'):
    """
    Loads data from mvac.out GPUMD output file.

    Args:
        Nc (int or list(int)):
            Number of time correlation points the VAC is computed for

        directory (str):
            Directory to load 'mvac.out' file from

        filename (str):
            File to load VAC from

    Returns:
        dict(dict): Dictionary with VAC data. The outermost dictionary stores each individual run

    .. csv-table:: Output dictionary
       :stub-columns: 1

       **key**,t,VACx,VACy,VACz
       **units**,ps,|v1|,|v1|,|v1|

    .. |v1| replace:: A\ :sup:`2` ps\ :sup:`-2`
    """
    Nc = __check_list(Nc, varname='Nc', dtype=int)
    vac_path = __get_path(directory, filename)
    data = pd.read_csv(vac_path, delim_whitespace=True, header=None)
    __check_range(Nc, data.shape[0])
    labels = ['t', 'VACx', 'VACy', 'VACz']
    return __basic_reader(Nc, data, labels)

def load_dos(num_dos_points, directory=None, filename='dos.out'):
    """
    Loads data from dos.out GPUMD output file.

    Args:
        num_dos_points (int or list(int)):
            Number of frequency points the DOS is computed for

        directory (str):
            Directory to load 'dos.out' file from (dir. of simulation)

        filename (str):
            File to load DOS from

    Returns:
        dict(dict): Dictionary with DOS data. The outermost dictionary stores each individual run

    .. csv-table:: Output dictionary
       :stub-columns: 1

       **key**,nu,DOSx,DOSy,DOSz
       **units**,THz,|d1|,|d1|,|d1|

    .. |d1| replace:: THz\ :sup:`-1`
    """
    num_dos_points = __check_list(num_dos_points, varname='num_dos_points', dtype=int)
    dos_path = __get_path(directory, filename)
    data = pd.read_csv(dos_path, delim_whitespace=True, header=None)
    __check_range(num_dos_points, data.shape[0])
    labels = ['nu', 'DOSx', 'DOSy', 'DOSz']
    out = __basic_reader(num_dos_points, data, labels)
    for key in out.keys():
        out[key]['nu'] /= (2 * np.pi)
    return out

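# Illustrative usage sketch for load_dos(). The number of frequency points must
# match the GPUMD DOS computation; note that 'nu' has already been converted
# from angular frequency to THz inside this function.
#
#     dos = load_dos(num_dos_points=400, directory='dos_run')
#     nu, dos_x = dos['run0']['nu'], dos['run0']['DOSx']
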
def load_shc(Nc, num_omega, directory=None, filename='shc.out'):
    """
    Loads the data from shc.out GPUMD output file.

    Args:
        Nc (int or list(int)):
            Maximum number of correlation steps. If multiple shc runs, can provide a list of Nc

        num_omega (int or list(int)):
            Number of frequency points. If multiple shc runs, can provide a list of num_omega

        directory (str):
            Directory to load 'shc.out' file from (dir. of simulation)

        filename (str):
            File to load SHC from

    Returns:
        dict: Dictionary of in- and out-of-plane shc results (average)

    .. csv-table:: Output dictionary
       :stub-columns: 1

       **key**,t,Ki,Ko,nu,jwi,jwo
       **units**,ps,|sh1|,|sh1|,THz,|sh2|,|sh2|

    .. |sh1| replace:: A eV ps\ :sup:`-1`
    .. |sh2| replace:: A eV ps\ :sup:`-1` THz\ :sup:`-1`
    """
    Nc = __check_list(Nc, varname='Nc', dtype=int)
    num_omega = __check_list(num_omega, varname='num_omega', dtype=int)
    if not len(Nc) == len(num_omega):
        raise ValueError('Nc and num_omega must be the same length.')

    shc_path = __get_path(directory, filename)
    data = pd.read_csv(shc_path, delim_whitespace=True, header=None)
    __check_range(np.array(Nc) * 2 - 1 + np.array(num_omega), data.shape[0])
    if not all([i > 0 for i in Nc]) or not all([i > 0 for i in num_omega]):
        raise ValueError('Only strictly positive numbers are allowed.')

    labels_corr = ['t', 'Ki', 'Ko']
    labels_omega = ['nu', 'jwi', 'jwo']

    out = dict()
    start = 0
    for i, varlen in enumerate(zip(Nc, num_omega)):
        run = dict()
        Nc_i = varlen[0] * 2 - 1
        num_omega_i = varlen[1]

        end = start + Nc_i
        for j, key in enumerate(labels_corr):
            run[key] = data[j][start:end].to_numpy(dtype='float')
        start = end

        end += num_omega_i
        for j, key in enumerate(labels_omega):
            run[key] = data[j][start:end].to_numpy(dtype='float')
        run['nu'] /= (2 * np.pi)
        start = end
        out['run{}'.format(i)] = run
    return out

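# Illustrative usage sketch for load_shc(). Nc and num_omega are hypothetical
# and must match the shc keyword of the GPUMD run; each run stores 2*Nc - 1
# correlation points and num_omega frequency points.
#
#     shc = load_shc(Nc=250, num_omega=400, directory='shc_run')
#     run0 = shc['run0']
#     spectral_current = run0['jwi'] + run0['jwo']  # A eV ps^-1 THz^-1
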
def load_kappa(directory=None, filename='kappa.out'):
    """
    Loads data from kappa.out GPUMD output file which contains HNEMD kappa.

    Args:
        directory (str):
            Directory containing kappa data file

        filename (str):
            The kappa data file

    Returns:
        dict: A dictionary with keys corresponding to the columns in 'kappa.out'

    .. csv-table:: Output dictionary
       :stub-columns: 1

       **key**,kxi,kxo,kyi,kyo,kz
       **units**,|k1|,|k1|,|k1|,|k1|,|k1|

    .. |k1| replace:: Wm\ :sup:`-1` K\ :sup:`-1`
    """
    kappa_path = __get_path(directory, filename)
    data = pd.read_csv(kappa_path, delim_whitespace=True, header=None)
    labels = ['kxi', 'kxo', 'kyi', 'kyo', 'kz']
    out = dict()
    for i, key in enumerate(labels):
        out[key] = data[i].to_numpy(dtype='float')
    return out

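# Illustrative usage sketch for load_kappa() on HNEMD output. A running average
# is a common way to judge convergence of the HNEMD thermal conductivity; the
# directory name is an assumption.
#
#     kappa = load_kappa(directory='hnemd_run')
#     kz = kappa['kz']
#     kz_running = np.cumsum(kz) / np.arange(1, kz.size + 1)   # W m^-1 K^-1
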
def load_hac(Nc, output_interval, directory=None, filename='hac.out'):
    """
    Loads data from hac.out GPUMD output file.

    Args:
        Nc (int or list(int)):
            Number of correlation steps

        output_interval (int or list(int)):
            Output interval for HAC and RTC data

        directory (str):
            Directory containing hac data file

        filename (str):
            The hac data file

    Returns:
        dict: A dictionary containing the data from hac runs

    .. csv-table:: Output dictionary
       :stub-columns: 1

       **key**,t,kxi,kxo,kyi,kyo,kz,jxijx,jxojx,jyijy,jyojy,jzjz
       **units**,ps,|h1|,|h1|,|h1|,|h1|,|h1|,|h2|,|h2|,|h2|,|h2|,|h2|

    .. |h1| replace:: Wm\ :sup:`-1` K\ :sup:`-1`
    .. |h2| replace:: eV\ :sup:`3` amu\ :sup:`-1`
    """
    Nc = __check_list(Nc, varname='Nc', dtype=int)
    output_interval = __check_list(output_interval, varname='output_interval', dtype=int)
    if not len(Nc) == len(output_interval):
        raise ValueError('Nc and output_interval must be the same length.')

    npoints = [int(x / y) for x, y in zip(Nc, output_interval)]
    hac_path = __get_path(directory, filename)
    data = pd.read_csv(hac_path, delim_whitespace=True, header=None)
    __check_range(npoints, data.shape[0])
    labels = ['t', 'jxijx', 'jxojx', 'jyijy', 'jyojy', 'jzjz',
              'kxi', 'kxo', 'kyi', 'kyo', 'kz']

    start = 0
    out = dict()
    for i, varlen in enumerate(npoints):
        end = start + varlen
        run = dict()
        for j, key in enumerate(labels):
            run[key] = data[j][start:end].to_numpy(dtype='float')
        start = end
        out['run{}'.format(i)] = run
    return out

def get_frequency_info(bin_f_size, eigfile='eigenvector.out', directory=None):
    """
    Gathers eigen-frequency information from the eigenvector file and sorts it appropriately based
    on the selected frequency bins (identical to internal GPUMD representation).

    Args:
        bin_f_size (float):
            The frequency-based bin size (in THz)

        eigfile (str):
            The filename of the eigenvector output/input file created by the GPUMD phonon package

        directory (str):
            Directory eigfile is stored in

    Returns:
        dict: Dictionary with the system eigen-frequency information along with binning information

    .. csv-table:: Output dictionary
       :stub-columns: 1

       **key**,fq,fmax,fmin,shift,nbins,bin_count,bin_f_size
       **units**,THz,THz,THz,N/A,N/A,N/A,THz
    """
    if not directory:
        eigpath = os.path.join(os.getcwd(), eigfile)
    else:
        eigpath = os.path.join(directory, eigfile)

    with open(eigpath, 'r') as f:
        om2 = [float(x) for x in f.readline().split()]

    fq = np.sign(om2) * np.sqrt(abs(np.array(om2))) / (2 * np.pi)
    fmax = (np.floor(np.abs(fq[-1]) / bin_f_size) + 1) * bin_f_size
    fmin = np.floor(np.abs(fq[0]) / bin_f_size) * bin_f_size
    shift = int(np.floor(np.abs(fmin) / bin_f_size))
    nbins = int(np.floor((fmax - fmin) / bin_f_size))
    bin_count = np.zeros(nbins)
    for freq in fq:
        bin_count[int(np.floor(np.abs(freq) / bin_f_size) - shift)] += 1
    return {'fq': fq, 'fmax': fmax, 'fmin': fmin, 'shift': shift,
            'nbins': nbins, 'bin_count': bin_count, 'bin_f_size': bin_f_size}

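# Illustrative usage sketch for get_frequency_info(). The 1 THz bin size and the
# directory name are assumptions; the result provides the same binning GPUMD used
# internally for modal post-processing.
#
#     freq = get_frequency_info(bin_f_size=1.0, directory='gkma_run')
#     print(freq['nbins'], freq['bin_count'].sum())             # total mode count
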
def reduce_frequency_info(freq, ndiv=1):
    """
    Recalculates frequency binning information based on how many times larger the bins are wanted.

    Args:
        freq (dict):
            Dictionary with frequency binning information from the get_frequency_info function output

        ndiv (int):
            Integer used to shrink the number of bins output. If you originally have 10 bins but want 5,
            ndiv=2. nbins/ndiv need not be an integer

    Returns:
        dict: Dictionary with the system eigen-frequency information along with binning information
    """
    freq = copy.deepcopy(freq)
    freq['bin_f_size'] = freq['bin_f_size'] * ndiv
    freq['fmax'] = (np.floor(np.abs(freq['fq'][-1]) / freq['bin_f_size']) + 1) * freq['bin_f_size']
    nbins_new = int(np.ceil(freq['nbins'] / ndiv))
    npad = nbins_new * ndiv - freq['nbins']
    freq['nbins'] = nbins_new
    freq['bin_count'] = np.pad(freq['bin_count'], [(0, npad)])
    freq['bin_count'] = np.sum(freq['bin_count'].reshape(-1, ndiv), axis=1)
    freq['ndiv'] = ndiv
    return freq
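
# Illustrative usage sketch for reduce_frequency_info(), coarsening the binning
# from get_frequency_info() by a factor of ndiv (here 2, an assumption) to match
# modal data that was loaded with the same ndiv.
#
#     freq_coarse = reduce_frequency_info(freq, ndiv=2)
#     print(freq_coarse['nbins'], freq_coarse['bin_f_size'])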