from aguaclara.core.units import u
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import os
from pathlib import Path
def column_of_data(path, start, column, end=None, units=""):
    """Extract a column of data from a ProCoDA data file.

    Note: Column 0 is time. The first data column is column 1.

    :param path: The file path of the ProCoDA data file
    :type path: string
    :param start: Index of first row of data to extract, inclusive
    :type start: int
    :param column: Index or label of the column that you want to extract
    :type column: int or string
    :param end: Index of last row of data to extract until, exclusive. Defaults to extracting all rows.
    :type end: int, optional
    :param units: The units you want to apply to the data, e.g. 'mg/L'. Defaults to "" (dimensionless).
    :type units: string, optional
    :return: The column of data
    :rtype: numpy.ndarray in units of [units]

    :Examples:

    .. code-block:: python

        data = column_of_data("Reactor_data.txt", 0, 1, -1, "mg/L")
    """
    df = pd.read_csv(path, delimiter='\t')
    if isinstance(column, int):
        data = df.iloc[start:end, column]
    else:
        data = df.iloc[start:end][column]
    # Keep only entries that parse as numbers; this drops note rows
    # that were inserted into the file during the experiment.
    num_data = data[pd.to_numeric(data, errors='coerce').notnull()]
    return np.array(num_data) * u(units)
def column_of_time(path, start, end=None, units="day"):
    """Extract the column of times as elapsed times from a ProCoDA data file.

    :param path: The file path of the ProCoDA data file.
    :type path: string
    :param start: Index of first row of data to extract from the data file
    :type start: int
    :param end: Index of last row of data to extract from the data. Defaults to last row
    :type end: int, optional
    :param units: The return type units, which defaults to day.
    :type units: string, optional
    :return: Experimental times starting at 0
    :rtype: numpy.ndarray in units of days or hours, specified with units parameter

    :Examples:

    .. code-block:: python

        time = column_of_time("Reactor_data.txt", 0)
    """
    df = pd.read_csv(path, delimiter='\t')
    start_time = pd.to_numeric(df.iloc[start, 0])
    day_times = df.iloc[start:end, 0]
    # Drop note rows (non-numeric entries in the time column).
    is_numeric = pd.to_numeric(day_times, errors='coerce').notnull()
    num_day_times = pd.to_numeric(day_times[is_numeric])
    # ProCoDA records times as day fractions; report them relative to start.
    elapsed_times = num_day_times - start_time
    return (np.array(elapsed_times) * u.day).to(u(units))
def plot_columns(path, columns, x_axis=None):
    """Plot columns of data, located by labels, in the given data file.

    :param path: The file path of the ProCoDA data file
    :type path: string
    :param columns: A single column label or list of column labels
    :type columns: string or string list
    :param x_axis: The label of the x-axis column (defaults to None)
    :type x_axis: string, optional
    :raises ValueError: if columns is neither a string nor a list
    :return: A list of Line2D objects representing the plotted data
    :rtype: matplotlib.lines.Line2D list
    """
    df = pd.read_csv(path, delimiter='\t')
    df = remove_notes(df)
    # Normalize to a list so a single label and a list share one code path.
    if isinstance(columns, str):
        columns = [columns]
    elif not isinstance(columns, list):
        raise ValueError('columns must be a string or list of strings')
    lines = []
    for c in columns:
        y = pd.to_numeric(df.loc[:, c])
        if x_axis is None:
            lines += plt.plot(y)
        else:
            x = pd.to_numeric(df.loc[:, x_axis])
            lines += plt.plot(x, y)
    # Return the Line2D objects as promised by the docstring (the original
    # implementation discarded them).
    return lines
def iplot_columns(path, columns, x_axis=None):
    """Plot columns of data, located by indexes, in the given data file.

    :param path: The file path of the ProCoDA data file
    :type path: string
    :param columns: A single column index or list of column indexes
    :type columns: int or int list
    :param x_axis: The index of the x-axis column (defaults to None)
    :type x_axis: int, optional
    :raises ValueError: if columns is neither an int nor a list
    :return: a list of Line2D objects representing the plotted data
    :rtype: matplotlib.lines.Line2D list
    """
    df = pd.read_csv(path, delimiter='\t')
    df = remove_notes(df)
    # Normalize to a list so a single index and a list share one code path.
    if isinstance(columns, int):
        columns = [columns]
    elif not isinstance(columns, list):
        raise ValueError('columns must be an int or a list of ints')
    lines = []
    for c in columns:
        y = pd.to_numeric(df.iloc[:, c])
        if x_axis is None:
            lines += plt.plot(y)
        else:
            x = pd.to_numeric(df.iloc[:, x_axis])
            lines += plt.plot(x, y)
    # Return the Line2D objects as promised by the docstring (the original
    # implementation discarded them).
    return lines
def notes(path):
    """Extract any experimental notes from a ProCoDA data file.

    Use this to identify the section of the data file that you want to extract.

    :param path: The file path of the ProCoDA data file.
    :type path: string
    :return: The rows of the data file that contain text notes inserted during the experiment.
    :rtype: pandas.DataFrame
    """
    df = pd.read_csv(path, delimiter='\t')
    # Note rows are those whose first (time) column is not numeric.
    return df[pd.to_numeric(df.iloc[:, 0], errors='coerce').isnull()]
def remove_notes(data):
    """Omit notes from a DataFrame object, where notes are identified as rows
    with non-numerical entries in the first column.

    :param data: DataFrame object to remove notes from
    :type data: Pandas.DataFrame
    :return: DataFrame object with no notes
    :rtype: Pandas.DataFrame
    """
    # Keep only rows whose first (time) column parses as a number.
    return data[pd.to_numeric(data.iloc[:, 0], errors='coerce').notnull()]
def get_data_by_time(path, columns, dates, start_time='00:00', end_time='23:59',
                     extension='.tsv', units='', elapsed=False):
    """Extract columns of data over one or more ProCoDA data files based on date
    and time. Valid only for files whose names are automatically generated by
    date, i.e. of the form "datalog_M-D-YYYY".

    Note: Column 0 is time. The first data column is column 1. Results for the
    time column are adjusted for multi-day experiments.

    :param path: The path to the folder containing the ProCoDA data file(s)
    :type path: string
    :param columns: A single column index or a list of column indexes
    :type columns: int or int list
    :param dates: A single date or list of dates, formatted "M-D-YYYY"
    :type dates: string or string list
    :param start_time: Starting time of data to extract, formatted 'HH:MM' (24-hour time)
    :type start_time: string, optional
    :param end_time: Ending time of data to extract, formatted 'HH:MM' (24-hour time)
    :type end_time: string, optional
    :param extension: File extension of the data file(s). Defaults to '.tsv'
    :type extension: string, optional
    :param units: A single unit or list of units to apply to each column, e.g. 'mg/L' or ['hr', 'mg/L']. Defaults to '' (dimensionless).
    :type units: string or string list, optional
    :param elapsed: If true, results for the time column are given in elapsed time
    :type elapsed: boolean, optional
    :return: the single column of data or a list of the columns of data (in the order of the indexes given in the columns variable)
    :rtype: 1D or 2D float list

    :Examples:

    .. code-block:: python

        data = get_data_by_time(path='/Users/.../ProCoDA Data/', columns=4, dates=['6-14-2018', '6-15-2018'], start_time='12:20', end_time='10:50')
        data = get_data_by_time(path='/Users/.../ProCoDA Data/', columns=[0,4], dates='6-14-2018', start_time='12:20', end_time='23:59')
        data = get_data_by_time(path='/Users/.../ProCoDA Data/', columns=[0,3,4], dates='6-14-2018')
    """
    # Rewrite GitHub web-page URLs to raw-content URLs so pandas can read them.
    if 'github.com' in path:
        path = path.replace('github.com', 'raw.githubusercontent.com')
        path = path.replace('blob/', '')
        path = path.replace('tree/', '')
    data = data_from_dates(path, dates, extension)  # one DataFrame per date
    first_time_column = pd.to_numeric(data[0].iloc[:, 0])
    # Clamp the requested start time to the first recorded time of day one.
    start = max(day_fraction(start_time), first_time_column[0])
    start_idx = (first_time_column >= start).idxmax()
    end_idx = (pd.to_numeric(data[-1].iloc[:, 0]) >=
               day_fraction(end_time)).idxmax() + 1
    if isinstance(columns, int):
        if columns == 0 and elapsed:
            col = column_start_to_end(data, columns, start_idx, end_idx)
            result = list(np.subtract(col, start))*u(units)
        else:
            result = column_start_to_end(data, columns, start_idx, end_idx)*u(units)
    else:  # columns is a list
        if units == '':
            units = ['']*len(columns)
        result = []
        for i, c in enumerate(columns):
            if c == 0 and elapsed:
                col = column_start_to_end(data, c, start_idx, end_idx)
                result.append(list(np.subtract(col, start))*u(units[i]))
            else:
                result.append(column_start_to_end(data, c, start_idx, end_idx)*u(units[i]))
    return result
def day_fraction(time):
    """Convert a 24-hour time to a fraction of a day. For example, midnight
    corresponds to 0.0, and noon to 0.5.

    :param time: Time in the form of 'HH:MM' (24-hour time)
    :type time: string
    :return: A day fraction
    :rtype: float

    :Examples:

    .. code-block:: python

        day_fraction("18:30")
    """
    parts = time.split(":")
    # 24 hours per day, 1440 minutes per day.
    return int(parts[0]) / 24 + int(parts[1]) / 1440
def data_from_dates(path, dates, extension):
    """Return a list of DataFrames representing the ProCoDA data files stored in
    the given path and recorded on the given dates.

    :param path: The path to the folder containing the ProCoDA data file(s)
    :type path: string
    :param dates: A single date or list of dates for which data was recorded, formatted "M-D-YYYY"
    :type dates: string or string list
    :param extension: File extension of the data file(s)
    :type extension: string
    :return: a list of DataFrames representing the ProCoDA data files recorded on the given dates
    :rtype: pandas.DataFrame list
    """
    # Accept either one date or a list of dates.
    date_list = dates if isinstance(dates, list) else [dates]
    frames = []
    for date in date_list:
        filename = os.path.join(path, 'datalog_' + date + extension)
        raw = pd.read_csv(filename, delimiter='\t')
        frames.append(remove_notes(raw))
    return frames
def column_start_to_end(data, column, start_idx, end_idx):
    """Return a list of numeric data entries in the given column from the starting
    index to the ending index. This list can be compiled over one or more
    DataFrames.

    :param data: a list of DataFrames to extract one column from
    :type data: Pandas.DataFrame list
    :param column: a column index
    :type column: int
    :param start_idx: the index of the starting row of the first DataFrame
    :type start_idx: int
    :param end_idx: the index of the ending row of the last DataFrame, excluding this row
    :type end_idx: int
    :return: a list of data from the given column
    :rtype: float list
    """
    if len(data) == 1:
        return list(pd.to_numeric(data[0].iloc[start_idx:end_idx, column]))

    result = list(pd.to_numeric(data[0].iloc[start_idx:, column]))
    for day, frame in enumerate(data[1:-1], start=1):
        # Zero out the first timestamp so the day offset applies cleanly.
        frame.iloc[0, 0] = 0
        # Assuming DataFrames cover consecutive days, shift the time column
        # (column 0) by the number of elapsed days; data columns are unshifted.
        offset = day if column == 0 else 0
        result.extend(pd.to_numeric(frame.iloc[:, column]) + offset)
    data[-1].iloc[0, 0] = 0
    offset = len(data) - 1 if column == 0 else 0
    result.extend(pd.to_numeric(data[-1].iloc[:end_idx, column]) + offset)
    return result
def get_data_by_state(path, dates, state, column, extension=".tsv"):
    """Reads a ProCoDA file and extracts the time and data column for each
    iteration of the given state.

    Note: column 0 is time, the first data column is column 1. Results for the
    time column are given in elapsed time.

    :param path: The path to the folder containing the ProCoDA data file(s), defaults to the current directory
    :type path: string
    :param dates: A single date or list of dates for which data was recorded, formatted "M-D-YYYY"
    :type dates: string or string list
    :param state: The state ID number for which data should be extracted
    :type state: int
    :param column: The integer index of the column that you want to extract OR the header of the column that you want to extract
    :type column: int or string
    :param extension: File extension of the data file(s). Defaults to '.tsv'
    :type extension: string, optional
    :return: A list of lists of the time and data columns extracted for each iteration of the state. For example, if "data" is the output, data[i][:,0] gives the time column and data[i][:,1] gives the data column for the ith iteration of the given state and column. data[i][0] would give the first [time, data] pair.
    :rtype: 3D float list

    :Examples:

    .. code-block:: python

        data = get_data_by_state(path='/Users/.../ProCoDA Data/', dates=["6-19-2013", "6-20-2013"], state=1, column=28)
    """
    # Rewrite GitHub web-page URLs to raw-content URLs so pandas can read them.
    if 'github.com' in path:
        path = path.replace('github.com', 'raw.githubusercontent.com')
        path = path.replace('blob/', '')
        path = path.replace('tree/', '')
    data_agg = []
    day = 0
    first_day = True
    overnight = False
    if path[-1] != '/':
        path += '/'
    if not isinstance(dates, list):
        dates = [dates]
    for d in dates:
        state_file = path + "statelog_" + d + extension
        data_file = path + "datalog_" + d + extension
        states = pd.read_csv(state_file, delimiter='\t')
        data = pd.read_csv(data_file, delimiter='\t')
        # Resolve a column header to its integer position while we still
        # have a DataFrame; a plain NumPy array (below) cannot be indexed
        # by label, which made the original string-column branch fail.
        if isinstance(column, int):
            column_idx = column
        else:
            column_idx = data.columns.get_loc(column)
        states = np.array(states)
        data = np.array(data)
        # State start times: rows of the state log that enter the target state.
        state_start_idx = states[:, 1] == state
        state_start = states[state_start_idx, 0]
        # State end times: the time of the row immediately after each start.
        state_end_idx = np.append([False], state_start_idx[0:-1])
        state_end = states[state_end_idx, 0]
        if overnight:
            # The state was still running at midnight: it continues from
            # time 0 of this day until the first state change in this file.
            state_start = np.insert(state_start, 0, 0)
            state_end = np.insert(state_end, 0, states[0, 0])
        if state_start_idx[-1]:
            # The state is still running at the end of this file.
            state_end = np.append(state_end, data[-1, 0])
        # Map state start/end times to row indices in the data array.
        data_start = []
        data_end = []
        for i in range(np.size(state_start)):
            data_start.append((data[:, 0] > state_start[i]).argmax())
            data_end.append((data[:, 0] > state_end[i]).argmax()-1)
        if np.size(data_end) < np.size(data_start):
            data_end = np.append(data_end, -1)
        if first_day:
            start_time = data[0, 0]
        for i in range(np.size(data_start)):
            # Times are reported as days elapsed since the first recorded time.
            t = data[data_start[i]:data_end[i], 0] + day - start_time
            c = data[data_start[i]:data_end[i], column_idx]
            if overnight and i == 0:
                # Merge the overnight continuation onto the previous day's
                # final iteration instead of starting a new one. (The original
                # np.insert call flattened the data and discarded the
                # aggregate list.)
                data_agg[-1] = np.vstack((data_agg[-1], np.vstack((t, c)).T))
            else:
                data_agg.append(np.vstack((t, c)).T)
        day += 1
        if first_day:
            first_day = False
        # Reset the flag each day: only carry over when this file actually
        # ends while the target state is running.
        overnight = bool(state_start_idx[-1])
    return data_agg
def read_state(dates, state, column, units="", path="", extension=".tsv"):
    """Reads a ProCoDA file and outputs the data column and time vector for
    each iteration of the given state.

    Note: Column 0 is time. The first data column is column 1.

    :param dates: A single date or list of dates for which data was recorded, formatted "M-D-YYYY"
    :type dates: string or string list
    :param state: The state ID number for which data should be extracted
    :type state: int
    :param column: Index of the column that you want to extract OR header of the column that you want to extract
    :type column: int or string
    :param units: The units you want to apply to the data, e.g. 'mg/L'. Defaults to "" (dimensionless)
    :type units: string, optional
    :param path: The file path of the ProCoDA data file.
    :type path: string
    :param extension: The file extension of the tab delimited file. Defaults to ".tsv"
    :type extension: string, optional
    :return: time (numpy.ndarray) - Times corresponding to the data (with units)
    :return: data (numpy.ndarray) - Data in the given column during the given state with units

    :Examples:

    .. code-block:: python

        time, data = read_state(["6-19-2013", "6-20-2013"], 1, 28, "mL/s")
    """
    data_agg = get_data_by_state(path, dates, state, column, extension)
    # Concatenate the per-iteration [time, data] arrays into one 2-column array.
    data_agg = np.vstack(data_agg)
    if units != "":
        return data_agg[:, 0]*u.day, data_agg[:, 1]*u(units)
    else:
        return data_agg[:, 0]*u.day, data_agg[:, 1]
def average_state(dates, state, column, units="", path="", extension=".tsv"):
    """Outputs the average value of the data for each instance of a state in
    the given ProCoDA files

    Note: Column 0 is time. The first data column is column 1.

    :param dates: A single date or list of dates for which data was recorded, formatted "M-D-YYYY"
    :type dates: string or string list
    :param state: The state ID number for which data should be extracted
    :type state: int
    :param column: Index of the column that you want to extract OR header of the column that you want to extract
    :type column: int or string
    :param units: The units you want to apply to the data, e.g. 'mg/L'. Defaults to "" (dimensionless)
    :type units: string, optional
    :param path: The file path of the ProCoDA data file.
    :type path: string
    :param extension: The file extension of the tab delimited file. Defaults to ".tsv"
    :type extension: string, optional
    :return: A list of averages for each instance of the given state
    :rtype: float list

    :Examples:

    .. code-block:: python

        data_avgs = average_state(["6-19-2013", "6-20-2013"], 1, 28, "mL/s")
    """
    data_agg = get_data_by_state(path, dates, state, column, extension)
    # len(), not np.size(): data_agg is a list of 2-D arrays, and np.size
    # would count (or fail on) the total number of elements rather than
    # the number of state iterations.
    averages = np.zeros(len(data_agg))
    for i, iteration in enumerate(data_agg):
        averages[i] = np.average(iteration[:, 1])
    if units != "":
        return averages*u(units)
    else:
        return averages
def write_calculations_to_csv(funcs, states, columns, path, headers, out_name,
                              metaids=[], extension=".tsv"):
    """Writes each output of the given functions on the given states and data
    columns to a new column in the specified output file.

    Note: Column 0 is time. The first data column is column 1.

    :param funcs: A function or list of functions which will be applied in order to the data. If only one function is given it is applied to all the states/columns
    :type funcs: function or function list
    :param states: The state ID numbers for which data should be extracted. List should be in order of calculation or if only one state is given then it will be used for all the calculations
    :type states: string or string list
    :param columns: The index of a column, the header of a column, a list of indexes, OR a list of headers of the column(s) that you want to apply calculations to
    :type columns: int, string, int list, or string list
    :param path: Path to your ProCoDA metafile (must be tab-delimited)
    :type path: string
    :param headers: List of the desired header for each calculation, in order
    :type headers: string list
    :param out_name: Desired name for the output file. Can include a relative path
    :type out_name: string
    :param metaids: A list of the experiment IDs you'd like to analyze from the metafile
    :type metaids: string list, optional
    :param extension: The file extension of the tab delimited file. Defaults to ".tsv"
    :type extension: string, optional
    :requires: funcs, states, columns, and headers are all of the same length if they are lists. Some being lists and some single values are okay.
    :return: out_name.csv (CSV file) - A CSV file with each column being a new calculation and each row being a new experiment on which the calculations were performed
    :return: output (Pandas.DataFrame) - Pandas DataFrame holding the same data that was written to the output file
    """
    # NOTE(review): metaids=[] is a mutable default argument; it is only read
    # here, but callers should not rely on mutating it.
    # Broadcast single values so every calculation has its own entry.
    if not isinstance(funcs, list):
        funcs = [funcs] * len(headers)
    if not isinstance(states, list):
        states = [states] * len(headers)
    if not isinstance(columns, list):
        columns = [columns] * len(headers)
    data_agg = []
    for i in range(len(headers)):
        ids, data = read_state_with_metafile(funcs[i], states[i], columns[i],
                                             path, metaids, extension)
        data_agg = np.append(data_agg, [data])
    # One row per experiment ID, one column per calculation.
    output = pd.DataFrame(data=np.vstack((ids, data_agg)).T,
                          columns=["ID"]+headers)
    output.to_csv(out_name, sep='\t')
    return output
def intersect(x, y1, y2):
    """Returns the intersections of two lines represented by a common set of x coordinates and
    two sets of y coordinates as three numpy arrays: the x coordinates of the intersections,
    the y coordinates of the intersections, and the indexes in x, y1, y2 immediately
    after the intersections.

    :param x: common set of x coordinates for the two lines
    :type x: numpy.ndarray
    :param y1: the y coordinates of the first line
    :type y1: numpy.ndarray
    :param y2: the y coordinates of the second line
    :type y2: numpy.ndarray
    :requires: x have no repeating values and is in ascending order
    :return: x_points-numpy.ndarray of the x coordinates where intersections occur
    :return: y_points-numpy.ndarray of the y coordinates where intersections occur
    :return: crossings-numpy.ndarray of the indexes after the intersections occur
    """
    x_points = np.array([])
    y_points = np.array([])
    # Indexes immediately after (y1 - y2) changes sign.
    crossings = (np.argwhere(np.diff(np.sign(y1-y2)))+1).flatten()
    for c in crossings:
        # Linearly interpolate each curve over the segment [c-1, c] and
        # solve slope1*x + b1 == slope2*x + b2 for the crossing point.
        slope1 = (y1[c] - y1[c-1]) / (x[c] - x[c-1])
        slope2 = (y2[c] - y2[c-1]) / (x[c] - x[c-1])
        b1 = y1[c] - slope1 * x[c]
        b2 = y2[c] - slope2 * x[c]
        x_cross = (b2-b1) / (slope1-slope2)
        x_points = np.append(x_points, x_cross)
        y_points = np.append(y_points, slope1 * x_cross + b1)
    return x_points, y_points, crossings