# (C) British Crown Copyright 2017, Met Office
#
# This code is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the
# Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This code is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# Functions for easy access to DWR observations.
import os
import os.path
import datetime
import pandas
import pkg_resources
def _get_data_file_name(variable,year,month):
"""Return the name of the file containing data for the
requested variable, at the specified time."""
fname="data/%04d/%02d/%s.txt" % (year,month,variable)
name=pkg_resources.resource_filename(__name__, fname)
return name
[docs]def load_observations_1file(variable,year,month):
"""Load all observations for one calendar month (for one variable)
Data must be available in directory ../../data.
Args:
variable (:obj:`str`): Variable name ('prmsl','air.2m', 'prate', etc.)
year (:obj:`int`): Year of assimilation run.
month (:obj:`int`): Month of assimilation run (1-12)
Returns:
:obj:`pandas.DataFrame`: Dataframe of observations.
Raises:
IOError: No data on disc for this variable, year, and month.
|
"""
of_name=_get_data_file_name(variable,year,month)
if not os.path.isfile(of_name):
raise IOError("No obs file for %s on %04d/%02d" % (variable,year,month))
o=pandas.read_table(of_name,sep='\s+',
header=None,
encoding="ascii",
names=['year','month','day','hour','minute',
'latitude','longitude',
'value',
'name'],
converters={'year': int, 'month': int, 'day' : int,
'hour': int, 'minute': int,
'latitude': float,'longitude': float,
'value': float,
'name': str},
na_values=['NA'],
comment=None)
# Add the datetime
o=o.assign(dtm=pandas.to_datetime(o[['year','month','day','hour','minute']]))
return o
[docs]def load_observations(variable,start,end):
"""Load observations from disc, for the selected period
Data must be available in directory $SCRATCH/DWR. Requires a data file for all calendar months in the period start to end.
Args:
variable (:obj:`str`): Variable name ('prmsl','air.2m', 'prate', etc.)
start (:obj:`datetime.datetime`): Get observations at or after this time.
end (:obj:`datetime.datetime`): Get observations before this time.
Returns:
:obj:`pandas.DataFrame`: Dataframe of observations.
Raises:
IOError: No data on disc for this variable, for a year and month in the selected period.
|
"""
result=None
ct=start
while(ct<end):
o=load_observations_1file(variable,ct.year,ct.month)
o2=o[(o.dtm>=start) & (o.dtm<end)]
if(result is None):
result=o2
else:
result=pandas.concat([result,o2])
ct=_add_one_month(ct)
return result
# Want to move a datetime to the beginning of the next month
def _add_one_month(dt0):
dt1 = dt0.replace(day=1)
dt2 = dt1 + datetime.timedelta(days=32)
dt3 = dt2.replace(day=1)
dt3 = dt3.replace(hour=0)
return dt3
# Get the observation at the station interpolated to the desired time
[docs]def at_station_and_time(obs,station,dte):
"""Get, from these observations, the value at the selected station and time.
Typically there are observations from each station only twice a day (sometimes less) to get the observed value at the specified time we do linear interpolation in time (using only observations for the selected station.
Args:
obs (:obj:`pandas.DataFrame`): Batch of observations for the period around the desired time. Probably from :func:`load_observations`.
station (:obj:`str`): Name of station, as used in obs.name.
dte (:obj:`datetime.datetime`): Time of required observed value.
Returns:
:obj:`float`: Interpolated observed value from station at time.
Raises:
StandardError: obs does not contain at least two values for selected station, one before and one after specified time. So interpolation not possible.
|
"""
at_station=obs.loc[obs['name']==station]
if at_station.empty:
raise Exception('No data for station %s' % station)
at_station=at_station.sort_values(by='dtm',ascending=True)
hit=at_station.loc[at_station['dtm']==dte]
if not hit.empty:
return hit['value'].values[0]
before=at_station.loc[at_station['dtm']<dte]
if before.empty:
raise Exception('No data for station %s before %s' % (station,
dte.strftime("%Y-%m-%d:%H:%M")))
before=before.iloc[-1] # last row
after=at_station.loc[at_station['dtm']>dte]
if after.empty:
raise Exception('No data for station %s after %s' % (station,
dte.strftime("%Y-%m-%d:%H:%M")))
after=after.iloc[0] # first row
weight=((dte-before['dtm']).total_seconds()/
(after['dtm']-before['dtm']).total_seconds())
return after['value']*weight+before['value']*(1-weight)
# Interpolation can introduce some big errors - add an indicator of
# how far in time this is from a real observation
[docs]def at_station_and_time_with_distance(obs,station,dte):
"""Get, from these observations, the value at the selected station and time, along with the time gap (in seconds) between the given interpolated value and a real observation.
Typically there are observations from each station only twice a day (sometimes less) to get the observed value at the specified time we do linear interpolation in time (using only observations for the selected station.
Args:
obs (:obj:`pandas.DataFrame`): Batch of observations for the period around the desired time. Probably from :func:`load_observations`.
station (:obj:`str`): Name of station, as used in obs.name.
dte (:obj:`datetime.datetime`): Time of required observed value.
Returns:
:obj:`float`: Interpolated observed value from station at time.
Raises:
StandardError: obs does not contain at least two values for selected station, one before and one after specified time. So interpolation not possible.
|
"""
at_station=obs.loc[obs['name']==station]
if at_station.empty:
raise Exception('No data for station %s' % station)
at_station=at_station.sort_values(by='dtm',ascending=True)
hit=at_station.loc[at_station['dtm']==dte]
if not hit.empty:
return [hit['value'].values[0],0]
before=at_station.loc[at_station['dtm']<dte]
if before.empty:
raise Exception('No data for station %s before %s' % (station,
dte.strftime("%Y-%m-%d:%H:%M")))
before=before.iloc[-1] # last row
after=at_station.loc[at_station['dtm']>dte]
if after.empty:
raise Exception('No data for station %s after %s' % (station,
dte.strftime("%Y-%m-%d:%H:%M")))
after=after.iloc[0] # first row
weight=((dte-before['dtm']).total_seconds()/
(after['dtm']-before['dtm']).total_seconds())
gap=min((dte-before['dtm']).total_seconds(),
(after['dtm']-dte).total_seconds())
return [after['value']*weight+before['value']*(1-weight),gap]
# Get the position of a named station
[docs]def get_station_location(obs,station):
"""Get, from these observations, the location (lat and lon) of the named station.
Args:
obs (:obj:`pandas.DataFrame`): Batch of observations. Probably from :func:`load_observations`.
station (:obj:`str`): Name of station, as used in obs.name.
Returns:
:obj:`dict`: Dictionary with keys 'latitude' and 'longitude'.
Raises:
StandardError: obs does not contain any observations for selected station.
|
"""
at_station=obs.loc[obs['name']==station]
if at_station.empty:
raise Exception('No data for station %s' % station)
result={'latitude': at_station['latitude'].values[0],
'longitude':at_station['longitude'].values[0]}
return result