In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from IPython.display import Image


%matplotlib inline

Datasets

I use two datasets:

samples consists of 19665 in-situ observations of 13C and 14C primary production measurements at different depths, matched to monthly satellite fields for input features. The data was divided into a training (80%) and a test (20%) dataset and used to fit a Random Forest and an XGBoost model. The csv file below includes information about test/train allocation, observed PP (pp_obs column), and predicted PP from RF and XGBoost (pp_rf, pp_xgb columns).
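
For reference, a minimal sketch of what the fit could look like, using the samples table loaded below; the feature list and model settings here are assumptions, not the actual configuration:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

features = ["Zeu", "sst", "chl", "sat_par"]  # assumed feature set
X_train, X_test, y_train, y_test = train_test_split(
    samples[features], samples["pp_obs"], test_size=0.2, random_state=0)

rf  = RandomForestRegressor(n_estimators=500).fit(X_train, y_train)
xgb = XGBRegressor(n_estimators=500).fit(X_train, y_train)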

subset is based on monthly global satellite fields for the year 2010. I took 39 x 10000 random samples from each monthly satellite field and assigned each set of 10000 a depth between 5 and 200 meters at a 5 meter interval. The resulting dataset, consisting of 10000 * 39 * 12 samples, was randomly subsampled to 19665 rows. This dataset is assumed to represent the global distribution of satellite-derived properties that we use as input features.
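
Roughly, the construction looks like this (a sketch; sample_monthly_field is a hypothetical helper that returns a DataFrame of 10000 random pixels from one monthly satellite field):

import numpy as np
import pandas as pd

depths = np.arange(5, 200, 5)   # 39 depth levels at a 5 m interval
frames = []
for month in range(1, 13):
    for depth in depths:
        df = sample_monthly_field(month, n=10000)  # hypothetical helper
        df["depth"] = depth
        frames.append(df)
subset = pd.concat(frames, ignore_index=True)
subset = subset.sample(n=19665, random_state=0)    # match samples in size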

Load the datasets:

In [2]:
#Samples
samples = pd.read_csv("http://monhegan.unh.edu/ppforest/global_PP_samples.csv")
sampstr = "insitu samples"

#subset
subset  = pd.read_csv("http://monhegan.unh.edu/ppforest/global_PP_subset.csv")
globstr = "global subset"

Let's start by plotting the observed and predicted PP for both models using the samples dataset:

In [3]:
mask = samples["test"].astype(bool)  # True for rows held out as test data
fig, axes = plt.subplots(1, 2, sharey=True, sharex=True)

ax = axes[0]
ax.scatter(samples["pp_obs"][~mask], samples["pp_xgb"][~mask], 3, label="train")
ax.scatter(samples["pp_obs"][mask], samples["pp_xgb"][mask], 3, label="test")
ax.set_xlabel("Observed PP")
ax.set_ylabel("Predicted PP")
ax.set_title("XGBoost")
ax.set_aspect(1)
ax.set_xlim(-9, 8)
ax.set_ylim(-9, 8)
ax.plot([-9, 8], [-9, 8], ":", c="0.5")
ax.legend()

ax = axes[1]
ax.scatter(samples["pp_obs"][~mask], samples["pp_rf"][~mask], 3, label="train")
ax.scatter(samples["pp_obs"][mask], samples["pp_rf"][mask], 3, label="test")
ax.plot([-9, 8], [-9, 8], ":", c="0.5")
ax.set_xlabel("Observed PP")
ax.set_aspect(1)
ax.set_title("Random Forest")
Out[3]:
Text(0.5, 1.0, 'Random Forest')
[Figure: observed vs. predicted log(PP) for XGBoost (left) and Random Forest (right), with train and test points]

Both models show very similar results and have reasonably good skill (R$^2$ = 0.84 for RF and R$^2$ = 0.86 for XGB on the test data).
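
As a check, these scores can be recomputed directly from the csv; a minimal sketch, assuming the test column flags the held-out rows:

from sklearn.metrics import r2_score

test = samples["test"].astype(bool)
print("RF  R2:", r2_score(samples["pp_obs"][test], samples["pp_rf"][test]))
print("XGB R2:", r2_score(samples["pp_obs"][test], samples["pp_xgb"][test]))

It looks very different when comparing the two models using the subset dataset, though: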

In [4]:
plt.clf()
plt.scatter(samples["pp_xgb"], samples["pp_rf"],3, label=sampstr)
plt.scatter(subset["pp_xgb"], subset["pp_rf"],3, label=globstr)
plt.xlabel("Xgboost log(PP)")
plt.ylabel("Random Forest log(PP)")
plt.gca().set_aspect(1)
plt.xlim(-9,8)
plt.ylim(-9,8)
plt.plot([-9,8], [-9,8], ":", c="0.5")
plt.legend()
Out[4]:
<matplotlib.legend.Legend at 0x14a9f6ad0>
[Figure: Random Forest vs. XGBoost log(PP) for the samples and global subset datasets]

Neither model predicts values as high or as low with input features from subset as it does with samples. I see three issues:

  1. Why does the samples dataset have a larger range of PP predictions than subset? Is it because conditions where very low PP values are predicted are too rare?
  2. Why does the Random Forest model have less variance than XGBoost? (Both points are quantified in the sketch after this list.)
  3. Is it possible to say anything about which model is most correct?
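
To put numbers on (1) and (2), one can compare the spread of the predictions in the two datasets, e.g.:

for name, df in [("samples", samples), ("subset", subset)]:
    for col in ["pp_rf", "pp_xgb"]:
        lo, hi = df[col].quantile([0.001, 0.999])
        print(f"{name:8s} {col:7s} std={df[col].std():.2f} "
              f"range=[{lo:.2f}, {hi:.2f}]")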

The reason for my concern is that the aggregated PP is very different for the two models:

In [5]:
print(f" Sum of PP from Random Forest: {np.sum(np.exp(subset["pp_rf"]))}")
print(f" Sum of PP from XGBoost:       {np.sum(np.exp(subset["pp_xgb"]))}")
 Sum of PP from Random Forest: 68365.27490220523
 Sum of PP from XGBoost:       126784.17463216199

The result is quite important since my global estimate of PP is ~50 Gt/yr using Random Forest and ~80 Gt/yr using XGBoost. The former is very close to what other models show and not very interesting; the latter is very interesting, if we can justify using XGBoost. Here are some examples from earlier studies, together with our estimates:

[Figure: global annual PP estimates from earlier studies and from this work]

Here are some figures showing the difference between XGBoost and Random Forest for a given month at different depths:

[Figures: XGBoost and Random Forest PP fields for a given month at five different depths]

I've also added some figures showing how the distributions of different input features compare between the two datasets:

In [6]:
plt.clf()
plt.hist(samples.Zeu, np.linspace(0,200,25), alpha=0.5, label=sampstr)
plt.hist(subset.Zeu,  np.linspace(0,200,25), alpha=0.5, label=globstr)
plt.xlabel("Zeu (m)")
plt.legend()
Out[6]:
<matplotlib.legend.Legend at 0x14acc3250>
[Figure: histogram of Zeu for both datasets]
In [7]:
plt.clf()
plt.hist(samples.sst, np.linspace(-5,35,25), alpha=0.5, label=sampstr)
plt.hist(subset.sst,  np.linspace(-5,35,25), alpha=0.5, label=globstr)
plt.xlabel("SST (°C)")
plt.legend()
Out[7]:
<matplotlib.legend.Legend at 0x14afaf4d0>
[Figure: histogram of SST for both datasets]
In [8]:
plt.clf()
# samples.chl appears to be stored as log(Chl), while subset.chl is linear,
# hence the log transform for the latter
plt.hist(samples.chl, np.linspace(-4,4,25), alpha=0.5, label=sampstr)
plt.hist(np.log(subset.chl),  np.linspace(-4,4,25), alpha=0.5, label=globstr)
plt.xlabel("log Chl (mg/m3)")
plt.legend()
Out[8]:
<matplotlib.legend.Legend at 0x14aa8f9d0>
[Figure: histogram of log Chl for both datasets]
In [9]:
plt.clf()
plt.hist(samples.sat_par, np.linspace(0,70,25), alpha=0.5, label=sampstr)
plt.hist(subset.sat_par,  np.linspace(0,70,25), alpha=0.5, label=globstr)
plt.xlabel("PAR")
plt.legend()
Out[9]:
<matplotlib.legend.Legend at 0x14ab6fed0>
[Figure: histogram of PAR for both datasets]
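
To go beyond eyeballing these histograms, a two-sample Kolmogorov-Smirnov test per feature gives a quick measure of how far apart the two distributions are. A minimal sketch (note the assumed log transform for subset.chl, mirroring the plot above):

from scipy.stats import ks_2samp

pairs = {"Zeu":     (samples.Zeu,     subset.Zeu),
         "sst":     (samples.sst,     subset.sst),
         "chl":     (samples.chl,     np.log(subset.chl)),
         "sat_par": (samples.sat_par, subset.sat_par)}
for name, (a, b) in pairs.items():
    stat, p = ks_2samp(a.dropna(), b.dropna())
    print(f"{name:8s} KS={stat:.3f} p={p:.2g}")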