]> git.angelumana.com Git - irma-modeling/.git/commitdiff
refined CV methodology + selected preferred model master origin/HEAD origin/master
authoribidyouadu <angel.d.umana@gmail.com>
Mon, 26 Oct 2020 08:14:54 +0000 (04:14 -0400)
committeribidyouadu <angel.d.umana@gmail.com>
Mon, 26 Oct 2020 08:14:54 +0000 (04:14 -0400)
irma_modeling.ipynb

index c0d5d96002b5fd5ddcfc8cec924570480aa9fd34..3b718c95e5349d1768d80d5e8e92fa37145c56fb 100644 (file)
@@ -1 +1 @@
-{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"irma_modeling.ipynb","provenance":[],"collapsed_sections":[],"toc_visible":true,"mount_file_id":"1Q4J-Du4O02VX-aMhBgnqA8CcQVFolx1x","authorship_tag":"ABX9TyPO/sXn6d3j8rTj+S49aF7/"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"markdown","metadata":{"id":"dpQIA2trCnVq"},"source":["# Hurricane Irma Damage Assessment Modeling\n","\n","In this notebook we develop a multilinear and decision tree model to assess damages from hurricane Irma.\n","\n","Our dataset consists of weather, socioeconomic, and Twitter parameters from 49 counties in Florida during Irma. The dependent variable is the amount of federal aid from FEMA that a county received, `dmg`."]},{"cell_type":"code","metadata":{"id":"ShFs31gOCjKO","executionInfo":{"status":"ok","timestamp":1603605566570,"user_tz":240,"elapsed":1032,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}},"outputId":"2c6c1ef5-6a1e-4237-e7dd-8b113aecf266","colab":{"base_uri":"https://localhost:8080/","height":1000}},"source":["%cd '/content/drive/My Drive/Colab Notebooks/disaster_assessment/irma_modeling'\n","\n","import pandas as pd\n","import numpy as np\n","from matplotlib import pyplot as plt\n","import seaborn as sns; sns.set()\n","\n","df = pd.read_csv('irma.csv')\n","df.set_index(keys='county',inplace=True)\n","df"],"execution_count":1,"outputs":[{"output_type":"stream","text":["/content/drive/My Drive/Colab Notebooks/disaster_assessment/irma_modeling\n"],"name":"stdout"},{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: 
right;\">\n","      <th></th>\n","      <th>maxwind_mean</th>\n","      <th>maxwind_peak</th>\n","      <th>precip_total</th>\n","      <th>precip_peak</th>\n","      <th>mhi</th>\n","      <th>poverty_rate</th>\n","      <th>poverty_pop</th>\n","      <th>population</th>\n","      <th>gdp</th>\n","      <th>twt_total</th>\n","      <th>twt_peak</th>\n","      <th>dmg</th>\n","    </tr>\n","    <tr>\n","      <th>county</th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>Alachua County</th>\n","      <td>12.453333</td>\n","      <td>35.0</td>\n","      <td>18.25</td>\n","      <td>11.39</td>\n","      <td>45230</td>\n","      <td>21.2</td>\n","      <td>53816</td>\n","      <td>269956</td>\n","      <td>11912080</td>\n","      <td>182</td>\n","      <td>39</td>\n","      <td>9.306765e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Baker County</th>\n","      <td>10.200000</td>\n","      <td>24.1</td>\n","      <td>9.21</td>\n","      <td>7.06</td>\n","      <td>51856</td>\n","      <td>15.3</td>\n","      <td>3900</td>\n","      <td>28355</td>\n","      <td>472948</td>\n","      <td>1</td>\n","      <td>1</td>\n","      <td>1.360526e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Bradford County</th>\n","      <td>12.453333</td>\n","      <td>35.0</td>\n","      <td>18.25</td>\n","      <td>11.39</td>\n","      <td>44997</td>\n","      <td>19.1</td>\n","      <td>4533</td>\n","      <td>27732</td>\n","      <td>532487</td>\n","      <td>4</td>\n","      <td>1</td>\n","      <td>3.246248e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Brevard County</th>\n","      <td>14.900000</td>\n","      <td>41.0</td>\n","      <td>4.69</td>\n","      <td>2.25</td>\n","      <td>52596</td>\n","      
<td>12.4</td>\n","      <td>72303</td>\n","      <td>596849</td>\n","      <td>20453753</td>\n","      <td>354</td>\n","      <td>71</td>\n","      <td>3.192891e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Broward County</th>\n","      <td>15.680000</td>\n","      <td>49.0</td>\n","      <td>10.44</td>\n","      <td>4.11</td>\n","      <td>56702</td>\n","      <td>13.1</td>\n","      <td>252288</td>\n","      <td>1951260</td>\n","      <td>96591919</td>\n","      <td>2036</td>\n","      <td>382</td>\n","      <td>1.329551e+08</td>\n","    </tr>\n","    <tr>\n","      <th>Charlotte County</th>\n","      <td>17.666667</td>\n","      <td>44.1</td>\n","      <td>1.69</td>\n","      <td>0.76</td>\n","      <td>51583</td>\n","      <td>10.8</td>\n","      <td>19300</td>\n","      <td>184998</td>\n","      <td>3966314</td>\n","      <td>72</td>\n","      <td>20</td>\n","      <td>6.625988e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Citrus County</th>\n","      <td>14.313333</td>\n","      <td>33.0</td>\n","      <td>13.18</td>\n","      <td>5.97</td>\n","      <td>43147</td>\n","      <td>14.4</td>\n","      <td>20654</td>\n","      <td>147929</td>\n","      <td>3335110</td>\n","      <td>67</td>\n","      <td>13</td>\n","      <td>5.568339e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Clay County</th>\n","      <td>10.200000</td>\n","      <td>24.1</td>\n","      <td>9.21</td>\n","      <td>7.06</td>\n","      <td>65375</td>\n","      <td>9.9</td>\n","      <td>20889</td>\n","      <td>216072</td>\n","      <td>3976007</td>\n","      <td>53</td>\n","      <td>8</td>\n","      <td>1.219590e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Collier County</th>\n","      <td>18.866667</td>\n","      <td>63.9</td>\n","      <td>5.12</td>\n","      <td>4.20</td>\n","      <td>66709</td>\n","      <td>11.7</td>\n","      <td>43075</td>\n","      <td>378488</td>\n","      <td>16124953</td>\n","      <td>231</td>\n","      <td>63</td>\n","      
<td>6.194133e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Columbia County</th>\n","      <td>12.453333</td>\n","      <td>35.0</td>\n","      <td>18.25</td>\n","      <td>11.39</td>\n","      <td>42097</td>\n","      <td>16.5</td>\n","      <td>10720</td>\n","      <td>70503</td>\n","      <td>1826541</td>\n","      <td>14</td>\n","      <td>4</td>\n","      <td>3.225476e+06</td>\n","    </tr>\n","    <tr>\n","      <th>DeSoto County</th>\n","      <td>17.666667</td>\n","      <td>44.1</td>\n","      <td>1.69</td>\n","      <td>0.76</td>\n","      <td>37342</td>\n","      <td>26.1</td>\n","      <td>8766</td>\n","      <td>37489</td>\n","      <td>735286</td>\n","      <td>5</td>\n","      <td>2</td>\n","      <td>4.049290e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Dixie County</th>\n","      <td>13.393333</td>\n","      <td>35.9</td>\n","      <td>4.39</td>\n","      <td>2.38</td>\n","      <td>38355</td>\n","      <td>24.4</td>\n","      <td>3627</td>\n","      <td>16700</td>\n","      <td>178261</td>\n","      <td>7</td>\n","      <td>5</td>\n","      <td>9.079057e+05</td>\n","    </tr>\n","    <tr>\n","      <th>Duval County</th>\n","      <td>15.626667</td>\n","      <td>42.9</td>\n","      <td>9.21</td>\n","      <td>7.06</td>\n","      <td>52105</td>\n","      <td>15.1</td>\n","      <td>138069</td>\n","      <td>950181</td>\n","      <td>60146765</td>\n","      <td>772</td>\n","      <td>137</td>\n","      <td>4.740938e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Flagler County</th>\n","      <td>15.920000</td>\n","      <td>44.1</td>\n","      <td>8.66</td>\n","      <td>4.66</td>\n","      <td>52713</td>\n","      <td>12.0</td>\n","      <td>13137</td>\n","      <td>112067</td>\n","      <td>1809151</td>\n","      <td>63</td>\n","      <td>16</td>\n","      <td>7.243310e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Gilchrist County</th>\n","      <td>13.393333</td>\n","      <td>35.9</td>\n","      <td>4.39</td>\n","      
<td>2.38</td>\n","      <td>42880</td>\n","      <td>16.1</td>\n","      <td>2675</td>\n","      <td>18256</td>\n","      <td>254260</td>\n","      <td>3</td>\n","      <td>1</td>\n","      <td>6.418368e+05</td>\n","    </tr>\n","    <tr>\n","      <th>Glades County</th>\n","      <td>14.053333</td>\n","      <td>40.0</td>\n","      <td>9.96</td>\n","      <td>7.09</td>\n","      <td>42865</td>\n","      <td>18.9</td>\n","      <td>2312</td>\n","      <td>13724</td>\n","      <td>171573</td>\n","      <td>4</td>\n","      <td>2</td>\n","      <td>1.684916e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Hamilton County</th>\n","      <td>11.066667</td>\n","      <td>20.0</td>\n","      <td>4.39</td>\n","      <td>2.38</td>\n","      <td>35262</td>\n","      <td>24.0</td>\n","      <td>2791</td>\n","      <td>14310</td>\n","      <td>377309</td>\n","      <td>5</td>\n","      <td>3</td>\n","      <td>7.796505e+05</td>\n","    </tr>\n","    <tr>\n","      <th>Hardee County</th>\n","      <td>18.146667</td>\n","      <td>54.0</td>\n","      <td>2.63</td>\n","      <td>1.36</td>\n","      <td>40056</td>\n","      <td>23.3</td>\n","      <td>6026</td>\n","      <td>27245</td>\n","      <td>893349</td>\n","      <td>8</td>\n","      <td>3</td>\n","      <td>6.791781e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Hendry County</th>\n","      <td>18.866667</td>\n","      <td>63.9</td>\n","      <td>12.61</td>\n","      <td>8.84</td>\n","      <td>38361</td>\n","      <td>23.9</td>\n","      <td>9525</td>\n","      <td>41556</td>\n","      <td>1241872</td>\n","      <td>2</td>\n","      <td>1</td>\n","      <td>4.864095e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Hernando County</th>\n","      <td>14.313333</td>\n","      <td>33.0</td>\n","      <td>13.18</td>\n","      <td>5.97</td>\n","      <td>44710</td>\n","      <td>14.0</td>\n","      <td>25773</td>\n","      <td>190865</td>\n","      <td>3031267</td>\n","      <td>43</td>\n","      <td>12</td>\n","      
<td>6.987439e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Highlands County</th>\n","      <td>18.146667</td>\n","      <td>54.0</td>\n","      <td>2.63</td>\n","      <td>1.36</td>\n","      <td>37445</td>\n","      <td>19.8</td>\n","      <td>20051</td>\n","      <td>105424</td>\n","      <td>2088782</td>\n","      <td>46</td>\n","      <td>16</td>\n","      <td>1.104181e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Hillsborough County</th>\n","      <td>10.140000</td>\n","      <td>27.0</td>\n","      <td>5.43</td>\n","      <td>4.50</td>\n","      <td>54741</td>\n","      <td>15.5</td>\n","      <td>214442</td>\n","      <td>1436888</td>\n","      <td>77093796</td>\n","      <td>990</td>\n","      <td>234</td>\n","      <td>4.139342e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Indian River County</th>\n","      <td>16.033333</td>\n","      <td>49.9</td>\n","      <td>10.86</td>\n","      <td>8.11</td>\n","      <td>51797</td>\n","      <td>10.6</td>\n","      <td>16249</td>\n","      <td>157413</td>\n","      <td>5001702</td>\n","      <td>79</td>\n","      <td>23</td>\n","      <td>5.417251e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Lafayette County</th>\n","      <td>11.066667</td>\n","      <td>20.0</td>\n","      <td>4.39</td>\n","      <td>2.38</td>\n","      <td>41549</td>\n","      <td>22.7</td>\n","      <td>1579</td>\n","      <td>8732</td>\n","      <td>160614</td>\n","      <td>2</td>\n","      <td>1</td>\n","      <td>4.387254e+05</td>\n","    </tr>\n","    <tr>\n","      <th>Lake County</th>\n","      <td>13.760000</td>\n","      <td>36.9</td>\n","      <td>10.51</td>\n","      <td>8.05</td>\n","      <td>51429</td>\n","      <td>12.6</td>\n","      <td>43020</td>\n","      <td>356495</td>\n","      <td>7452383</td>\n","      <td>151</td>\n","      <td>47</td>\n","      <td>1.418739e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Lee County</th>\n","      <td>15.206667</td>\n","      <td>49.9</td>\n","      <td>6.08</td>\n","  
    <td>1.89</td>\n","      <td>54198</td>\n","      <td>11.8</td>\n","      <td>85844</td>\n","      <td>754610</td>\n","      <td>23806704</td>\n","      <td>380</td>\n","      <td>63</td>\n","      <td>6.269670e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Levy County</th>\n","      <td>13.393333</td>\n","      <td>35.9</td>\n","      <td>4.39</td>\n","      <td>2.38</td>\n","      <td>37272</td>\n","      <td>20.8</td>\n","      <td>8329</td>\n","      <td>40770</td>\n","      <td>636701</td>\n","      <td>7</td>\n","      <td>2</td>\n","      <td>1.823098e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Manatee County</th>\n","      <td>15.620000</td>\n","      <td>42.0</td>\n","      <td>7.08</td>\n","      <td>5.71</td>\n","      <td>55189</td>\n","      <td>10.8</td>\n","      <td>41057</td>\n","      <td>394855</td>\n","      <td>11968028</td>\n","      <td>170</td>\n","      <td>33</td>\n","      <td>1.369299e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Marion County</th>\n","      <td>11.150000</td>\n","      <td>32.1</td>\n","      <td>0.44</td>\n","      <td>0.23</td>\n","      <td>43772</td>\n","      <td>16.2</td>\n","      <td>55880</td>\n","      <td>359977</td>\n","      <td>7956019</td>\n","      <td>121</td>\n","      <td>30</td>\n","      <td>1.934269e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Martin County</th>\n","      <td>12.593333</td>\n","      <td>22.0</td>\n","      <td>16.32</td>\n","      <td>9.43</td>\n","      <td>58344</td>\n","      <td>10.9</td>\n","      <td>17002</td>\n","      <td>160912</td>\n","      <td>6533103</td>\n","      <td>85</td>\n","      <td>22</td>\n","      <td>2.936391e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Miami-Dade County</th>\n","      <td>15.900000</td>\n","      <td>42.9</td>\n","      <td>9.33</td>\n","      <td>5.18</td>\n","      <td>49758</td>\n","      <td>16.7</td>\n","      <td>452649</td>\n","      <td>2761581</td>\n","      <td>141734334</td>\n","      
<td>4063</td>\n","      <td>730</td>\n","      <td>2.417464e+08</td>\n","    </tr>\n","    <tr>\n","      <th>Monroe County</th>\n","      <td>15.900000</td>\n","      <td>42.9</td>\n","      <td>9.33</td>\n","      <td>5.18</td>\n","      <td>63009</td>\n","      <td>11.8</td>\n","      <td>8963</td>\n","      <td>75027</td>\n","      <td>4097511</td>\n","      <td>158</td>\n","      <td>31</td>\n","      <td>1.061900e+08</td>\n","    </tr>\n","    <tr>\n","      <th>Nassau County</th>\n","      <td>14.566667</td>\n","      <td>42.9</td>\n","      <td>10.05</td>\n","      <td>9.01</td>\n","      <td>70590</td>\n","      <td>9.1</td>\n","      <td>7484</td>\n","      <td>85832</td>\n","      <td>1886261</td>\n","      <td>49</td>\n","      <td>16</td>\n","      <td>4.184525e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Okeechobee County</th>\n","      <td>14.053333</td>\n","      <td>40.0</td>\n","      <td>2.63</td>\n","      <td>1.36</td>\n","      <td>42524</td>\n","      <td>21.8</td>\n","      <td>8415</td>\n","      <td>41537</td>\n","      <td>1021477</td>\n","      <td>7</td>\n","      <td>2</td>\n","      <td>2.920370e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Orange County</th>\n","      <td>16.326667</td>\n","      <td>44.1</td>\n","      <td>10.36</td>\n","      <td>7.48</td>\n","      <td>54021</td>\n","      <td>15.3</td>\n","      <td>201528</td>\n","      <td>1380645</td>\n","      <td>89817807</td>\n","      <td>1794</td>\n","      <td>346</td>\n","      <td>5.216380e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Osceola County</th>\n","      <td>19.046667</td>\n","      <td>49.9</td>\n","      <td>13.43</td>\n","      <td>7.02</td>\n","      <td>49284</td>\n","      <td>14.0</td>\n","      <td>48892</td>\n","      <td>367990</td>\n","      <td>9207981</td>\n","      <td>205</td>\n","      <td>50</td>\n","      <td>1.244058e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Palm Beach County</th>\n","      <td>18.200000</td>\n","      
<td>51.1</td>\n","      <td>3.96</td>\n","      <td>1.92</td>\n","      <td>60059</td>\n","      <td>11.8</td>\n","      <td>170868</td>\n","      <td>1485941</td>\n","      <td>76866505</td>\n","      <td>904</td>\n","      <td>167</td>\n","      <td>8.307278e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Pasco County</th>\n","      <td>14.313333</td>\n","      <td>33.0</td>\n","      <td>13.18</td>\n","      <td>5.97</td>\n","      <td>51247</td>\n","      <td>13.0</td>\n","      <td>67635</td>\n","      <td>539630</td>\n","      <td>9330553</td>\n","      <td>206</td>\n","      <td>58</td>\n","      <td>1.824896e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Pinellas County</th>\n","      <td>15.446667</td>\n","      <td>42.9</td>\n","      <td>5.85</td>\n","      <td>4.32</td>\n","      <td>51488</td>\n","      <td>12.2</td>\n","      <td>115990</td>\n","      <td>975280</td>\n","      <td>44125945</td>\n","      <td>730</td>\n","      <td>159</td>\n","      <td>5.632790e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Polk County</th>\n","      <td>17.544444</td>\n","      <td>36.9</td>\n","      <td>8.06</td>\n","      <td>6.21</td>\n","      <td>48328</td>\n","      <td>16.1</td>\n","      <td>107844</td>\n","      <td>708009</td>\n","      <td>20779632</td>\n","      <td>524</td>\n","      <td>116</td>\n","      <td>4.351088e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Putnam County</th>\n","      <td>16.980000</td>\n","      <td>45.1</td>\n","      <td>18.25</td>\n","      <td>11.39</td>\n","      <td>34390</td>\n","      <td>26.3</td>\n","      <td>18954</td>\n","      <td>74163</td>\n","      <td>1925314</td>\n","      <td>16</td>\n","      <td>4</td>\n","      <td>1.110617e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Sarasota County</th>\n","      <td>15.620000</td>\n","      <td>45.1</td>\n","      <td>9.21</td>\n","      <td>7.06</td>\n","      <td>77022</td>\n","      <td>8.3</td>\n","      <td>20118</td>\n","      
<td>254261</td>\n","      <td>7313073</td>\n","      <td>243</td>\n","      <td>68</td>\n","      <td>1.306549e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Seminole County</th>\n","      <td>16.740000</td>\n","      <td>49.9</td>\n","      <td>16.32</td>\n","      <td>9.43</td>\n","      <td>49995</td>\n","      <td>12.8</td>\n","      <td>39839</td>\n","      <td>321128</td>\n","      <td>7030683</td>\n","      <td>331</td>\n","      <td>81</td>\n","      <td>2.173058e+07</td>\n","    </tr>\n","    <tr>\n","      <th>St. Johns County</th>\n","      <td>16.980000</td>\n","      <td>42.0</td>\n","      <td>7.08</td>\n","      <td>5.71</td>\n","      <td>58423</td>\n","      <td>9.2</td>\n","      <td>38065</td>\n","      <td>426718</td>\n","      <td>15773229</td>\n","      <td>154</td>\n","      <td>34</td>\n","      <td>1.862228e+07</td>\n","    </tr>\n","    <tr>\n","      <th>St. Lucie County</th>\n","      <td>16.033333</td>\n","      <td>44.1</td>\n","      <td>11.28</td>\n","      <td>9.24</td>\n","      <td>63865</td>\n","      <td>11.2</td>\n","      <td>51321</td>\n","      <td>467832</td>\n","      <td>17902542</td>\n","      <td>112</td>\n","      <td>32</td>\n","      <td>2.350643e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Sumter County</th>\n","      <td>13.760000</td>\n","      <td>36.9</td>\n","      <td>10.51</td>\n","      <td>8.05</td>\n","      <td>57931</td>\n","      <td>9.1</td>\n","      <td>10672</td>\n","      <td>128754</td>\n","      <td>2312425</td>\n","      <td>21</td>\n","      <td>7</td>\n","      <td>3.424526e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Suwannee County</th>\n","      <td>11.066667</td>\n","      <td>20.0</td>\n","      <td>4.39</td>\n","      <td>2.38</td>\n","      <td>44144</td>\n","      <td>20.3</td>\n","      <td>8299</td>\n","      <td>44191</td>\n","      <td>851332</td>\n","      <td>12</td>\n","      <td>4</td>\n","      <td>2.045323e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Union 
County</th>\n","      <td>12.453333</td>\n","      <td>35.0</td>\n","      <td>18.25</td>\n","      <td>11.39</td>\n","      <td>47373</td>\n","      <td>22.2</td>\n","      <td>2291</td>\n","      <td>14940</td>\n","      <td>341763</td>\n","      <td>3</td>\n","      <td>3</td>\n","      <td>5.774985e+05</td>\n","    </tr>\n","    <tr>\n","      <th>Volusia County</th>\n","      <td>16.333333</td>\n","      <td>35.0</td>\n","      <td>8.66</td>\n","      <td>4.66</td>\n","      <td>46911</td>\n","      <td>15.2</td>\n","      <td>79877</td>\n","      <td>547538</td>\n","      <td>14864234</td>\n","      <td>266</td>\n","      <td>51</td>\n","      <td>3.782572e+07</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["                     maxwind_mean  maxwind_peak  ...  twt_peak           dmg\n","county                                           ...                        \n","Alachua County          12.453333          35.0  ...        39  9.306765e+06\n","Baker County            10.200000          24.1  ...         1  1.360526e+06\n","Bradford County         12.453333          35.0  ...         1  3.246248e+06\n","Brevard County          14.900000          41.0  ...        71  3.192891e+07\n","Broward County          15.680000          49.0  ...       382  1.329551e+08\n","Charlotte County        17.666667          44.1  ...        20  6.625988e+06\n","Citrus County           14.313333          33.0  ...        13  5.568339e+06\n","Clay County             10.200000          24.1  ...         8  1.219590e+07\n","Collier County          18.866667          63.9  ...        63  6.194133e+07\n","Columbia County         12.453333          35.0  ...         4  3.225476e+06\n","DeSoto County           17.666667          44.1  ...         2  4.049290e+06\n","Dixie County            13.393333          35.9  ...         5  9.079057e+05\n","Duval County            15.626667          42.9  ...       
137  4.740938e+07\n","Flagler County          15.920000          44.1  ...        16  7.243310e+06\n","Gilchrist County        13.393333          35.9  ...         1  6.418368e+05\n","Glades County           14.053333          40.0  ...         2  1.684916e+06\n","Hamilton County         11.066667          20.0  ...         3  7.796505e+05\n","Hardee County           18.146667          54.0  ...         3  6.791781e+06\n","Hendry County           18.866667          63.9  ...         1  4.864095e+06\n","Hernando County         14.313333          33.0  ...        12  6.987439e+06\n","Highlands County        18.146667          54.0  ...        16  1.104181e+07\n","Hillsborough County     10.140000          27.0  ...       234  4.139342e+07\n","Indian River County     16.033333          49.9  ...        23  5.417251e+06\n","Lafayette County        11.066667          20.0  ...         1  4.387254e+05\n","Lake County             13.760000          36.9  ...        47  1.418739e+07\n","Lee County              15.206667          49.9  ...        63  6.269670e+07\n","Levy County             13.393333          35.9  ...         2  1.823098e+06\n","Manatee County          15.620000          42.0  ...        33  1.369299e+07\n","Marion County           11.150000          32.1  ...        30  1.934269e+07\n","Martin County           12.593333          22.0  ...        22  2.936391e+06\n","Miami-Dade County       15.900000          42.9  ...       730  2.417464e+08\n","Monroe County           15.900000          42.9  ...        31  1.061900e+08\n","Nassau County           14.566667          42.9  ...        16  4.184525e+06\n","Okeechobee County       14.053333          40.0  ...         2  2.920370e+06\n","Orange County           16.326667          44.1  ...       346  5.216380e+07\n","Osceola County          19.046667          49.9  ...        50  1.244058e+07\n","Palm Beach County       18.200000          51.1  ...       
167  8.307278e+07\n","Pasco County            14.313333          33.0  ...        58  1.824896e+07\n","Pinellas County         15.446667          42.9  ...       159  5.632790e+07\n","Polk County             17.544444          36.9  ...       116  4.351088e+07\n","Putnam County           16.980000          45.1  ...         4  1.110617e+07\n","Sarasota County         15.620000          45.1  ...        68  1.306549e+07\n","Seminole County         16.740000          49.9  ...        81  2.173058e+07\n","St. Johns County        16.980000          42.0  ...        34  1.862228e+07\n","St. Lucie County        16.033333          44.1  ...        32  2.350643e+07\n","Sumter County           13.760000          36.9  ...         7  3.424526e+06\n","Suwannee County         11.066667          20.0  ...         4  2.045323e+06\n","Union County            12.453333          35.0  ...         3  5.774985e+05\n","Volusia County          16.333333          35.0  ...        51  3.782572e+07\n","\n","[49 rows x 12 columns]"]},"metadata":{"tags":[]},"execution_count":1}]},{"cell_type":"markdown","metadata":{"id":"Fg-Ticrhdpih"},"source":["## Feature Selection\n","\n","We have 11 features, but do we really need them all? Especially when we only have 49 data points. 
As the following heat map shows, a lot of the variables correlate with each other, forming distinct clusters."]},{"cell_type":"code","metadata":{"id":"B-5M2buOeaQV","executionInfo":{"status":"ok","timestamp":1603605567553,"user_tz":240,"elapsed":1990,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}},"outputId":"03ab3528-f9cc-4e71-f9f3-7c6e0c94d69c","colab":{"base_uri":"https://localhost:8080/","height":512}},"source":["plt.figure(figsize=(12,7))\n","sns.heatmap(df.corr(),\n","            annot=True,\n","            fmt = '.2f',\n","            cmap='coolwarm')\n","plt.show()"],"execution_count":2,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":["<Figure size 864x504 with 2 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}}]},{"cell_type":"markdown","metadata":{"id":"SWRPvSoZQ341"},"source":["We can identify these clusters as follows:\n","\n","- Wind: `maxwind_mean`, `maxwind_peak`\n","- Precipitation: `precip_total`, `precip_peak`\n","- Socioeconomic: `mhi`, `poverty_rate`\n","- Population: `poverty_pop`, `population`, `gdp`, `twt_total`, `twt_peak`\n","\n","Our goal is to predict `dmg`. To avoid high multicollinearity among our predictors, let's simply take one variable from each cluster. We'll pick the ones with the strongest correlation to `dmg`. This gives us `maxwind_peak`, `precip_peak`, `mhi`, `twt_total`.\n","\n","Let's verify that these variables are independent of each other using **Variance Inflation Factor (VIF)**. The VIF of an independent variable is a measure of how much its variation can be attributed to other independent variables. The higher it is, the more redundant the (not so) independent variable is. VIF is calculated as follows:\n","\n","$$ VIF = \\frac{1}{1 - R^2_i} $$\n","\n","where $R_i^2$ is the $R^2$ of the multilinear regression model of the $i^{\\text{th}}$ independent variable using the other independent variables as predictors. 
Why not just use $R^2_i$ to measure redundancy? One intuition is that the inverse relationship will harshly penalize smaller and smaller values of $1 - R^2_i$ (the so-called **tolerance**).\n","\n","Let's compute the VIF for each of the four predictors we're using:"]},{"cell_type":"code","metadata":{"id":"nnUg2sP6-DIk","executionInfo":{"status":"ok","timestamp":1603605568034,"user_tz":240,"elapsed":2453,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}},"outputId":"e055b869-8298-4a60-e0fd-5f89f120ed63","colab":{"base_uri":"https://localhost:8080/","height":144}},"source":["from sklearn.preprocessing import robust_scale\n","from statsmodels.stats.outliers_influence import variance_inflation_factor as vif\n","import statsmodels.api as sm\n","\n","# preprocess data\n","features = ['maxwind_peak', 'precip_peak', 'mhi', 'twt_total']\n","X = df[features].apply(robust_scale)\n","X_vif = sm.add_constant(X.values) # for the vif func\n","\n","vif_dict = {}\n","for f in features:\n","    idx = features.index(f) # need this for vif func\n","    vif_dict[f] = vif(X_vif, idx)\n","vif_dict"],"execution_count":3,"outputs":[{"output_type":"stream","text":["/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n","  import pandas.util.testing as tm\n"],"name":"stderr"},{"output_type":"execute_result","data":{"text/plain":["{'maxwind_peak': 1.1609234512973168,\n"," 'mhi': 1.0514008951220526,\n"," 'precip_peak': 1.0447003684876797,\n"," 'twt_total': 1.097731476287398}"]},"metadata":{"tags":[]},"execution_count":3}]},{"cell_type":"markdown","metadata":{"id":"jFmHET7jZGLm"},"source":["This is good! The typical threshold for VIFs is 10, sometimes 5. 
All the VIFs calculated were close to 1, indicating that there is very little multicollinearity among the variables.\n","\n","## Modeling\n","Now that we have settled on our features, let's model our data. We will try out a linear regression and decision tree model and see which comes out best."]},{"cell_type":"code","metadata":{"id":"XHNL9NDLGKy6","executionInfo":{"status":"ok","timestamp":1603605568039,"user_tz":240,"elapsed":2442,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}}},"source":["from sklearn.model_selection import train_test_split\n","\n","Y = robust_scale(df['dmg'])\n","\n","X_train, X_test, Y_train, Y_test = train_test_split(X, Y,\n","                                                    test_size = 0.1,\n","                                                    random_state = 0,\n","                                                    shuffle = True)"],"execution_count":4,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"SpztfD5hGLd1"},"source":["### Linear Regression\n","\n","First, we will run cross validation on linear regressors and find the best performing model."]},{"cell_type":"code","metadata":{"id":"0ONf_rgoZhOS","executionInfo":{"status":"ok","timestamp":1603605568041,"user_tz":240,"elapsed":2429,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}},"outputId":"3121c275-e795-4bef-d9e8-792b1a27ddd9","colab":{"base_uri":"https://localhost:8080/","height":195}},"source":["from sklearn.model_selection import cross_validate, KFold\n","from sklearn.linear_model import LinearRegression\n","\n","reg = LinearRegression()\n","kf = KFold(5)\n","cv_scores = cross_validate(reg,\n","                           X_train, Y_train,\n","                           cv = kf,\n","                           scoring = 'r2',\n","                           return_train_score = True,\n","                           return_estimator = True)\n","\n","best_score = np.max(cv_scores['test_score'])\n","best_idx = 
np.where(cv_scores['test_score'] == best_score)[0][0] # need the [0][0] to get the idx\n","linreg_model = cv_scores['estimator'][best_idx]\n","print('Best CV r2: %.2f' % best_score)\n","cv_scores"],"execution_count":5,"outputs":[{"output_type":"stream","text":["Best CV r2: 0.81\n"],"name":"stdout"},{"output_type":"execute_result","data":{"text/plain":["{'estimator': (LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),\n","  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),\n","  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),\n","  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),\n","  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)),\n"," 'fit_time': array([0.00223398, 0.00137377, 0.00139618, 0.00130749, 0.00127864]),\n"," 'score_time': array([0.00135279, 0.00084329, 0.00083733, 0.00079083, 0.00079751]),\n"," 'test_score': array([-0.17218448, -1.99457306,  0.57294813,  0.81115215,  0.6499881 ]),\n"," 'train_score': array([0.80465442, 0.62029971, 0.54761734, 0.42716121, 0.54831034])}"]},"metadata":{"tags":[]},"execution_count":5}]},{"cell_type":"markdown","metadata":{"id":"J4WYVukfMbh2"},"source":["### Decision Tree\n","Now let's find the optimal decision tree model."]},{"cell_type":"code","metadata":{"id":"sS8der1cMoYP","executionInfo":{"status":"ok","timestamp":1603605568050,"user_tz":240,"elapsed":2421,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}},"outputId":"2254943f-f274-4d1b-efdc-6a11aa0ac136","colab":{"base_uri":"https://localhost:8080/","height":639}},"source":["from sklearn.tree import DecisionTreeRegressor\n","\n","tree = DecisionTreeRegressor(random_state = 0)\n","\n","cv_scores = cross_validate(tree,\n","                           X_train, Y_train,\n","                           cv = kf,\n","                           scoring = 'r2',\n","                         
  return_train_score = True,\n","                           return_estimator = True)\n","\n","best_score = np.max(cv_scores['test_score'])\n","best_idx = np.where(cv_scores['test_score'] == best_score)[0][0]\n","dt_model = cv_scores['estimator'][best_idx]\n","print(\"Best CV r2: %.2f\" % best_score)\n","cv_scores"],"execution_count":6,"outputs":[{"output_type":"stream","text":["Best CV r2: 0.82\n"],"name":"stdout"},{"output_type":"execute_result","data":{"text/plain":["{'estimator': (DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,\n","                        max_features=None, max_leaf_nodes=None,\n","                        min_impurity_decrease=0.0, min_impurity_split=None,\n","                        min_samples_leaf=1, min_samples_split=2,\n","                        min_weight_fraction_leaf=0.0, presort='deprecated',\n","                        random_state=0, splitter='best'),\n","  DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,\n","                        max_features=None, max_leaf_nodes=None,\n","                        min_impurity_decrease=0.0, min_impurity_split=None,\n","                        min_samples_leaf=1, min_samples_split=2,\n","                        min_weight_fraction_leaf=0.0, presort='deprecated',\n","                        random_state=0, splitter='best'),\n","  DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,\n","                        max_features=None, max_leaf_nodes=None,\n","                        min_impurity_decrease=0.0, min_impurity_split=None,\n","                        min_samples_leaf=1, min_samples_split=2,\n","                        min_weight_fraction_leaf=0.0, presort='deprecated',\n","                        random_state=0, splitter='best'),\n","  DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,\n","                        max_features=None, max_leaf_nodes=None,\n","                        min_impurity_decrease=0.0, 
min_impurity_split=None,\n","                        min_samples_leaf=1, min_samples_split=2,\n","                        min_weight_fraction_leaf=0.0, presort='deprecated',\n","                        random_state=0, splitter='best'),\n","  DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,\n","                        max_features=None, max_leaf_nodes=None,\n","                        min_impurity_decrease=0.0, min_impurity_split=None,\n","                        min_samples_leaf=1, min_samples_split=2,\n","                        min_weight_fraction_leaf=0.0, presort='deprecated',\n","                        random_state=0, splitter='best')),\n"," 'fit_time': array([0.00195765, 0.00157475, 0.00157881, 0.00152278, 0.00159955]),\n"," 'score_time': array([0.00102186, 0.00093675, 0.00092006, 0.00089002, 0.00084209]),\n"," 'test_score': array([ 0.25140641, -1.84462954,  0.17624502,  0.82366205,  0.04245883]),\n"," 'train_score': array([1., 1., 1., 1., 1.])}"]},"metadata":{"tags":[]},"execution_count":6}]},{"cell_type":"markdown","metadata":{"id":"g0Jhv1mSQ4gA"},"source":["The decision tree appears to be grossly overfitting, as the last two rows show. The train $R^2$ values are all 1! 
Let's see how these two models compare on the test set..."]},{"cell_type":"code","metadata":{"id":"cLZDH08SRgyS","executionInfo":{"status":"ok","timestamp":1603605568054,"user_tz":240,"elapsed":2409,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}},"outputId":"1af108b9-1275-4073-d717-065b8ab303fa","colab":{"base_uri":"https://localhost:8080/","height":141}},"source":["linreg_train_score = linreg_model.score(X_train, Y_train)\n","linreg_test_score = linreg_model.score(X_test, Y_test)\n","tree_train_score = dt_model.score(X_train, Y_train)\n","tree_test_score = dt_model.score(X_test, Y_test)\n","\n","print(\"===== Linear Regression r2 Scores =====\")\n","print(\"Train: %.2f\" % linreg_train_score)\n","print(\"Test: %.2f\" % linreg_test_score)\n","print()\n","print(\"===== Decision Tree Regressor r2 Scores =====\")\n","print(\"Train: %.2f\" % tree_train_score)\n","print(\"Test: %.2f\" % tree_test_score)"],"execution_count":7,"outputs":[{"output_type":"stream","text":["===== Linear Regression r2 Scores =====\n","Train: 0.55\n","Test: 0.65\n","\n","===== Decision Tree Regressor r2 Scores =====\n","Train: 0.95\n","Test: 0.50\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"LRrQZPrrTY-j"},"source":["Both models overfit, although the linear regressor less so than the decision tree. The test scores are also modest. What do we do now?\n","\n","Ideas:\n","\n","- Use regularization for linear regression: ridge\n","- Look at the parameters of the dt. Is there anything that looks like it should be tweaked to prevent overfitting? Maybe eg the depth is too large.\n","- Can we use another model? Random forest regression seems like a popular alternative. It will eliminate some of the explainability form DT regression, but should alleviate some of the overfitting."]}]}
\ No newline at end of file
+{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"irma_modeling.ipynb","provenance":[],"collapsed_sections":[],"toc_visible":true,"mount_file_id":"1Q4J-Du4O02VX-aMhBgnqA8CcQVFolx1x","authorship_tag":"ABX9TyO58oGmbwmCT3JpyThA0NLp"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"markdown","metadata":{"id":"dpQIA2trCnVq"},"source":["# Hurricane Irma Damage Assessment Modeling\n","\n","In this notebook we develop a multilinear and decision tree model to assess damages from hurricane Irma.\n","\n","Our dataset consists of weather, socioeconomic, and Twitter parameters from 49 counties in Florida during Irma. The dependent variable is the amount of federal aid from FEMA that a county received, `dmg`."]},{"cell_type":"code","metadata":{"id":"ShFs31gOCjKO","executionInfo":{"status":"ok","timestamp":1603699821465,"user_tz":240,"elapsed":677,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}},"outputId":"eb454c3e-526e-4920-f137-e38c4544dce1","colab":{"base_uri":"https://localhost:8080/","height":1000}},"source":["%cd '/content/drive/My Drive/Colab Notebooks/disaster_assessment/irma_modeling'\n","\n","import pandas as pd\n","import numpy as np\n","from matplotlib import pyplot as plt\n","import seaborn as sns; sns.set()\n","\n","df = pd.read_csv('irma.csv')\n","df.set_index(keys='county',inplace=True)\n","df"],"execution_count":1,"outputs":[{"output_type":"stream","text":["/content/drive/My Drive/Colab Notebooks/disaster_assessment/irma_modeling\n"],"name":"stdout"},{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: 
right;\">\n","      <th></th>\n","      <th>maxwind_mean</th>\n","      <th>maxwind_peak</th>\n","      <th>precip_total</th>\n","      <th>precip_peak</th>\n","      <th>mhi</th>\n","      <th>poverty_rate</th>\n","      <th>poverty_pop</th>\n","      <th>population</th>\n","      <th>gdp</th>\n","      <th>twt_total</th>\n","      <th>twt_peak</th>\n","      <th>dmg</th>\n","    </tr>\n","    <tr>\n","      <th>county</th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>Alachua County</th>\n","      <td>12.453333</td>\n","      <td>35.0</td>\n","      <td>18.25</td>\n","      <td>11.39</td>\n","      <td>45230</td>\n","      <td>21.2</td>\n","      <td>53816</td>\n","      <td>269956</td>\n","      <td>11912080</td>\n","      <td>182</td>\n","      <td>39</td>\n","      <td>9.306765e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Baker County</th>\n","      <td>10.200000</td>\n","      <td>24.1</td>\n","      <td>9.21</td>\n","      <td>7.06</td>\n","      <td>51856</td>\n","      <td>15.3</td>\n","      <td>3900</td>\n","      <td>28355</td>\n","      <td>472948</td>\n","      <td>1</td>\n","      <td>1</td>\n","      <td>1.360526e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Bradford County</th>\n","      <td>12.453333</td>\n","      <td>35.0</td>\n","      <td>18.25</td>\n","      <td>11.39</td>\n","      <td>44997</td>\n","      <td>19.1</td>\n","      <td>4533</td>\n","      <td>27732</td>\n","      <td>532487</td>\n","      <td>4</td>\n","      <td>1</td>\n","      <td>3.246248e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Brevard County</th>\n","      <td>14.900000</td>\n","      <td>41.0</td>\n","      <td>4.69</td>\n","      <td>2.25</td>\n","      <td>52596</td>\n","      
<td>12.4</td>\n","      <td>72303</td>\n","      <td>596849</td>\n","      <td>20453753</td>\n","      <td>354</td>\n","      <td>71</td>\n","      <td>3.192891e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Broward County</th>\n","      <td>15.680000</td>\n","      <td>49.0</td>\n","      <td>10.44</td>\n","      <td>4.11</td>\n","      <td>56702</td>\n","      <td>13.1</td>\n","      <td>252288</td>\n","      <td>1951260</td>\n","      <td>96591919</td>\n","      <td>2036</td>\n","      <td>382</td>\n","      <td>1.329551e+08</td>\n","    </tr>\n","    <tr>\n","      <th>Charlotte County</th>\n","      <td>17.666667</td>\n","      <td>44.1</td>\n","      <td>1.69</td>\n","      <td>0.76</td>\n","      <td>51583</td>\n","      <td>10.8</td>\n","      <td>19300</td>\n","      <td>184998</td>\n","      <td>3966314</td>\n","      <td>72</td>\n","      <td>20</td>\n","      <td>6.625988e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Citrus County</th>\n","      <td>14.313333</td>\n","      <td>33.0</td>\n","      <td>13.18</td>\n","      <td>5.97</td>\n","      <td>43147</td>\n","      <td>14.4</td>\n","      <td>20654</td>\n","      <td>147929</td>\n","      <td>3335110</td>\n","      <td>67</td>\n","      <td>13</td>\n","      <td>5.568339e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Clay County</th>\n","      <td>10.200000</td>\n","      <td>24.1</td>\n","      <td>9.21</td>\n","      <td>7.06</td>\n","      <td>65375</td>\n","      <td>9.9</td>\n","      <td>20889</td>\n","      <td>216072</td>\n","      <td>3976007</td>\n","      <td>53</td>\n","      <td>8</td>\n","      <td>1.219590e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Collier County</th>\n","      <td>18.866667</td>\n","      <td>63.9</td>\n","      <td>5.12</td>\n","      <td>4.20</td>\n","      <td>66709</td>\n","      <td>11.7</td>\n","      <td>43075</td>\n","      <td>378488</td>\n","      <td>16124953</td>\n","      <td>231</td>\n","      <td>63</td>\n","      
<td>6.194133e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Columbia County</th>\n","      <td>12.453333</td>\n","      <td>35.0</td>\n","      <td>18.25</td>\n","      <td>11.39</td>\n","      <td>42097</td>\n","      <td>16.5</td>\n","      <td>10720</td>\n","      <td>70503</td>\n","      <td>1826541</td>\n","      <td>14</td>\n","      <td>4</td>\n","      <td>3.225476e+06</td>\n","    </tr>\n","    <tr>\n","      <th>DeSoto County</th>\n","      <td>17.666667</td>\n","      <td>44.1</td>\n","      <td>1.69</td>\n","      <td>0.76</td>\n","      <td>37342</td>\n","      <td>26.1</td>\n","      <td>8766</td>\n","      <td>37489</td>\n","      <td>735286</td>\n","      <td>5</td>\n","      <td>2</td>\n","      <td>4.049290e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Dixie County</th>\n","      <td>13.393333</td>\n","      <td>35.9</td>\n","      <td>4.39</td>\n","      <td>2.38</td>\n","      <td>38355</td>\n","      <td>24.4</td>\n","      <td>3627</td>\n","      <td>16700</td>\n","      <td>178261</td>\n","      <td>7</td>\n","      <td>5</td>\n","      <td>9.079057e+05</td>\n","    </tr>\n","    <tr>\n","      <th>Duval County</th>\n","      <td>15.626667</td>\n","      <td>42.9</td>\n","      <td>9.21</td>\n","      <td>7.06</td>\n","      <td>52105</td>\n","      <td>15.1</td>\n","      <td>138069</td>\n","      <td>950181</td>\n","      <td>60146765</td>\n","      <td>772</td>\n","      <td>137</td>\n","      <td>4.740938e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Flagler County</th>\n","      <td>15.920000</td>\n","      <td>44.1</td>\n","      <td>8.66</td>\n","      <td>4.66</td>\n","      <td>52713</td>\n","      <td>12.0</td>\n","      <td>13137</td>\n","      <td>112067</td>\n","      <td>1809151</td>\n","      <td>63</td>\n","      <td>16</td>\n","      <td>7.243310e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Gilchrist County</th>\n","      <td>13.393333</td>\n","      <td>35.9</td>\n","      <td>4.39</td>\n","      
<td>2.38</td>\n","      <td>42880</td>\n","      <td>16.1</td>\n","      <td>2675</td>\n","      <td>18256</td>\n","      <td>254260</td>\n","      <td>3</td>\n","      <td>1</td>\n","      <td>6.418368e+05</td>\n","    </tr>\n","    <tr>\n","      <th>Glades County</th>\n","      <td>14.053333</td>\n","      <td>40.0</td>\n","      <td>9.96</td>\n","      <td>7.09</td>\n","      <td>42865</td>\n","      <td>18.9</td>\n","      <td>2312</td>\n","      <td>13724</td>\n","      <td>171573</td>\n","      <td>4</td>\n","      <td>2</td>\n","      <td>1.684916e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Hamilton County</th>\n","      <td>11.066667</td>\n","      <td>20.0</td>\n","      <td>4.39</td>\n","      <td>2.38</td>\n","      <td>35262</td>\n","      <td>24.0</td>\n","      <td>2791</td>\n","      <td>14310</td>\n","      <td>377309</td>\n","      <td>5</td>\n","      <td>3</td>\n","      <td>7.796505e+05</td>\n","    </tr>\n","    <tr>\n","      <th>Hardee County</th>\n","      <td>18.146667</td>\n","      <td>54.0</td>\n","      <td>2.63</td>\n","      <td>1.36</td>\n","      <td>40056</td>\n","      <td>23.3</td>\n","      <td>6026</td>\n","      <td>27245</td>\n","      <td>893349</td>\n","      <td>8</td>\n","      <td>3</td>\n","      <td>6.791781e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Hendry County</th>\n","      <td>18.866667</td>\n","      <td>63.9</td>\n","      <td>12.61</td>\n","      <td>8.84</td>\n","      <td>38361</td>\n","      <td>23.9</td>\n","      <td>9525</td>\n","      <td>41556</td>\n","      <td>1241872</td>\n","      <td>2</td>\n","      <td>1</td>\n","      <td>4.864095e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Hernando County</th>\n","      <td>14.313333</td>\n","      <td>33.0</td>\n","      <td>13.18</td>\n","      <td>5.97</td>\n","      <td>44710</td>\n","      <td>14.0</td>\n","      <td>25773</td>\n","      <td>190865</td>\n","      <td>3031267</td>\n","      <td>43</td>\n","      <td>12</td>\n","      
<td>6.987439e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Highlands County</th>\n","      <td>18.146667</td>\n","      <td>54.0</td>\n","      <td>2.63</td>\n","      <td>1.36</td>\n","      <td>37445</td>\n","      <td>19.8</td>\n","      <td>20051</td>\n","      <td>105424</td>\n","      <td>2088782</td>\n","      <td>46</td>\n","      <td>16</td>\n","      <td>1.104181e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Hillsborough County</th>\n","      <td>10.140000</td>\n","      <td>27.0</td>\n","      <td>5.43</td>\n","      <td>4.50</td>\n","      <td>54741</td>\n","      <td>15.5</td>\n","      <td>214442</td>\n","      <td>1436888</td>\n","      <td>77093796</td>\n","      <td>990</td>\n","      <td>234</td>\n","      <td>4.139342e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Indian River County</th>\n","      <td>16.033333</td>\n","      <td>49.9</td>\n","      <td>10.86</td>\n","      <td>8.11</td>\n","      <td>51797</td>\n","      <td>10.6</td>\n","      <td>16249</td>\n","      <td>157413</td>\n","      <td>5001702</td>\n","      <td>79</td>\n","      <td>23</td>\n","      <td>5.417251e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Lafayette County</th>\n","      <td>11.066667</td>\n","      <td>20.0</td>\n","      <td>4.39</td>\n","      <td>2.38</td>\n","      <td>41549</td>\n","      <td>22.7</td>\n","      <td>1579</td>\n","      <td>8732</td>\n","      <td>160614</td>\n","      <td>2</td>\n","      <td>1</td>\n","      <td>4.387254e+05</td>\n","    </tr>\n","    <tr>\n","      <th>Lake County</th>\n","      <td>13.760000</td>\n","      <td>36.9</td>\n","      <td>10.51</td>\n","      <td>8.05</td>\n","      <td>51429</td>\n","      <td>12.6</td>\n","      <td>43020</td>\n","      <td>356495</td>\n","      <td>7452383</td>\n","      <td>151</td>\n","      <td>47</td>\n","      <td>1.418739e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Lee County</th>\n","      <td>15.206667</td>\n","      <td>49.9</td>\n","      <td>6.08</td>\n","  
    <td>1.89</td>\n","      <td>54198</td>\n","      <td>11.8</td>\n","      <td>85844</td>\n","      <td>754610</td>\n","      <td>23806704</td>\n","      <td>380</td>\n","      <td>63</td>\n","      <td>6.269670e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Levy County</th>\n","      <td>13.393333</td>\n","      <td>35.9</td>\n","      <td>4.39</td>\n","      <td>2.38</td>\n","      <td>37272</td>\n","      <td>20.8</td>\n","      <td>8329</td>\n","      <td>40770</td>\n","      <td>636701</td>\n","      <td>7</td>\n","      <td>2</td>\n","      <td>1.823098e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Manatee County</th>\n","      <td>15.620000</td>\n","      <td>42.0</td>\n","      <td>7.08</td>\n","      <td>5.71</td>\n","      <td>55189</td>\n","      <td>10.8</td>\n","      <td>41057</td>\n","      <td>394855</td>\n","      <td>11968028</td>\n","      <td>170</td>\n","      <td>33</td>\n","      <td>1.369299e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Marion County</th>\n","      <td>11.150000</td>\n","      <td>32.1</td>\n","      <td>0.44</td>\n","      <td>0.23</td>\n","      <td>43772</td>\n","      <td>16.2</td>\n","      <td>55880</td>\n","      <td>359977</td>\n","      <td>7956019</td>\n","      <td>121</td>\n","      <td>30</td>\n","      <td>1.934269e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Martin County</th>\n","      <td>12.593333</td>\n","      <td>22.0</td>\n","      <td>16.32</td>\n","      <td>9.43</td>\n","      <td>58344</td>\n","      <td>10.9</td>\n","      <td>17002</td>\n","      <td>160912</td>\n","      <td>6533103</td>\n","      <td>85</td>\n","      <td>22</td>\n","      <td>2.936391e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Miami-Dade County</th>\n","      <td>15.900000</td>\n","      <td>42.9</td>\n","      <td>9.33</td>\n","      <td>5.18</td>\n","      <td>49758</td>\n","      <td>16.7</td>\n","      <td>452649</td>\n","      <td>2761581</td>\n","      <td>141734334</td>\n","      
<td>4063</td>\n","      <td>730</td>\n","      <td>2.417464e+08</td>\n","    </tr>\n","    <tr>\n","      <th>Monroe County</th>\n","      <td>15.900000</td>\n","      <td>42.9</td>\n","      <td>9.33</td>\n","      <td>5.18</td>\n","      <td>63009</td>\n","      <td>11.8</td>\n","      <td>8963</td>\n","      <td>75027</td>\n","      <td>4097511</td>\n","      <td>158</td>\n","      <td>31</td>\n","      <td>1.061900e+08</td>\n","    </tr>\n","    <tr>\n","      <th>Nassau County</th>\n","      <td>14.566667</td>\n","      <td>42.9</td>\n","      <td>10.05</td>\n","      <td>9.01</td>\n","      <td>70590</td>\n","      <td>9.1</td>\n","      <td>7484</td>\n","      <td>85832</td>\n","      <td>1886261</td>\n","      <td>49</td>\n","      <td>16</td>\n","      <td>4.184525e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Okeechobee County</th>\n","      <td>14.053333</td>\n","      <td>40.0</td>\n","      <td>2.63</td>\n","      <td>1.36</td>\n","      <td>42524</td>\n","      <td>21.8</td>\n","      <td>8415</td>\n","      <td>41537</td>\n","      <td>1021477</td>\n","      <td>7</td>\n","      <td>2</td>\n","      <td>2.920370e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Orange County</th>\n","      <td>16.326667</td>\n","      <td>44.1</td>\n","      <td>10.36</td>\n","      <td>7.48</td>\n","      <td>54021</td>\n","      <td>15.3</td>\n","      <td>201528</td>\n","      <td>1380645</td>\n","      <td>89817807</td>\n","      <td>1794</td>\n","      <td>346</td>\n","      <td>5.216380e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Osceola County</th>\n","      <td>19.046667</td>\n","      <td>49.9</td>\n","      <td>13.43</td>\n","      <td>7.02</td>\n","      <td>49284</td>\n","      <td>14.0</td>\n","      <td>48892</td>\n","      <td>367990</td>\n","      <td>9207981</td>\n","      <td>205</td>\n","      <td>50</td>\n","      <td>1.244058e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Palm Beach County</th>\n","      <td>18.200000</td>\n","      
<td>51.1</td>\n","      <td>3.96</td>\n","      <td>1.92</td>\n","      <td>60059</td>\n","      <td>11.8</td>\n","      <td>170868</td>\n","      <td>1485941</td>\n","      <td>76866505</td>\n","      <td>904</td>\n","      <td>167</td>\n","      <td>8.307278e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Pasco County</th>\n","      <td>14.313333</td>\n","      <td>33.0</td>\n","      <td>13.18</td>\n","      <td>5.97</td>\n","      <td>51247</td>\n","      <td>13.0</td>\n","      <td>67635</td>\n","      <td>539630</td>\n","      <td>9330553</td>\n","      <td>206</td>\n","      <td>58</td>\n","      <td>1.824896e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Pinellas County</th>\n","      <td>15.446667</td>\n","      <td>42.9</td>\n","      <td>5.85</td>\n","      <td>4.32</td>\n","      <td>51488</td>\n","      <td>12.2</td>\n","      <td>115990</td>\n","      <td>975280</td>\n","      <td>44125945</td>\n","      <td>730</td>\n","      <td>159</td>\n","      <td>5.632790e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Polk County</th>\n","      <td>17.544444</td>\n","      <td>36.9</td>\n","      <td>8.06</td>\n","      <td>6.21</td>\n","      <td>48328</td>\n","      <td>16.1</td>\n","      <td>107844</td>\n","      <td>708009</td>\n","      <td>20779632</td>\n","      <td>524</td>\n","      <td>116</td>\n","      <td>4.351088e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Putnam County</th>\n","      <td>16.980000</td>\n","      <td>45.1</td>\n","      <td>18.25</td>\n","      <td>11.39</td>\n","      <td>34390</td>\n","      <td>26.3</td>\n","      <td>18954</td>\n","      <td>74163</td>\n","      <td>1925314</td>\n","      <td>16</td>\n","      <td>4</td>\n","      <td>1.110617e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Sarasota County</th>\n","      <td>15.620000</td>\n","      <td>45.1</td>\n","      <td>9.21</td>\n","      <td>7.06</td>\n","      <td>77022</td>\n","      <td>8.3</td>\n","      <td>20118</td>\n","      
<td>254261</td>\n","      <td>7313073</td>\n","      <td>243</td>\n","      <td>68</td>\n","      <td>1.306549e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Seminole County</th>\n","      <td>16.740000</td>\n","      <td>49.9</td>\n","      <td>16.32</td>\n","      <td>9.43</td>\n","      <td>49995</td>\n","      <td>12.8</td>\n","      <td>39839</td>\n","      <td>321128</td>\n","      <td>7030683</td>\n","      <td>331</td>\n","      <td>81</td>\n","      <td>2.173058e+07</td>\n","    </tr>\n","    <tr>\n","      <th>St. Johns County</th>\n","      <td>16.980000</td>\n","      <td>42.0</td>\n","      <td>7.08</td>\n","      <td>5.71</td>\n","      <td>58423</td>\n","      <td>9.2</td>\n","      <td>38065</td>\n","      <td>426718</td>\n","      <td>15773229</td>\n","      <td>154</td>\n","      <td>34</td>\n","      <td>1.862228e+07</td>\n","    </tr>\n","    <tr>\n","      <th>St. Lucie County</th>\n","      <td>16.033333</td>\n","      <td>44.1</td>\n","      <td>11.28</td>\n","      <td>9.24</td>\n","      <td>63865</td>\n","      <td>11.2</td>\n","      <td>51321</td>\n","      <td>467832</td>\n","      <td>17902542</td>\n","      <td>112</td>\n","      <td>32</td>\n","      <td>2.350643e+07</td>\n","    </tr>\n","    <tr>\n","      <th>Sumter County</th>\n","      <td>13.760000</td>\n","      <td>36.9</td>\n","      <td>10.51</td>\n","      <td>8.05</td>\n","      <td>57931</td>\n","      <td>9.1</td>\n","      <td>10672</td>\n","      <td>128754</td>\n","      <td>2312425</td>\n","      <td>21</td>\n","      <td>7</td>\n","      <td>3.424526e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Suwannee County</th>\n","      <td>11.066667</td>\n","      <td>20.0</td>\n","      <td>4.39</td>\n","      <td>2.38</td>\n","      <td>44144</td>\n","      <td>20.3</td>\n","      <td>8299</td>\n","      <td>44191</td>\n","      <td>851332</td>\n","      <td>12</td>\n","      <td>4</td>\n","      <td>2.045323e+06</td>\n","    </tr>\n","    <tr>\n","      <th>Union 
County</th>\n","      <td>12.453333</td>\n","      <td>35.0</td>\n","      <td>18.25</td>\n","      <td>11.39</td>\n","      <td>47373</td>\n","      <td>22.2</td>\n","      <td>2291</td>\n","      <td>14940</td>\n","      <td>341763</td>\n","      <td>3</td>\n","      <td>3</td>\n","      <td>5.774985e+05</td>\n","    </tr>\n","    <tr>\n","      <th>Volusia County</th>\n","      <td>16.333333</td>\n","      <td>35.0</td>\n","      <td>8.66</td>\n","      <td>4.66</td>\n","      <td>46911</td>\n","      <td>15.2</td>\n","      <td>79877</td>\n","      <td>547538</td>\n","      <td>14864234</td>\n","      <td>266</td>\n","      <td>51</td>\n","      <td>3.782572e+07</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["                     maxwind_mean  maxwind_peak  ...  twt_peak           dmg\n","county                                           ...                        \n","Alachua County          12.453333          35.0  ...        39  9.306765e+06\n","Baker County            10.200000          24.1  ...         1  1.360526e+06\n","Bradford County         12.453333          35.0  ...         1  3.246248e+06\n","Brevard County          14.900000          41.0  ...        71  3.192891e+07\n","Broward County          15.680000          49.0  ...       382  1.329551e+08\n","Charlotte County        17.666667          44.1  ...        20  6.625988e+06\n","Citrus County           14.313333          33.0  ...        13  5.568339e+06\n","Clay County             10.200000          24.1  ...         8  1.219590e+07\n","Collier County          18.866667          63.9  ...        63  6.194133e+07\n","Columbia County         12.453333          35.0  ...         4  3.225476e+06\n","DeSoto County           17.666667          44.1  ...         2  4.049290e+06\n","Dixie County            13.393333          35.9  ...         5  9.079057e+05\n","Duval County            15.626667          42.9  ...       
137  4.740938e+07\n","Flagler County          15.920000          44.1  ...        16  7.243310e+06\n","Gilchrist County        13.393333          35.9  ...         1  6.418368e+05\n","Glades County           14.053333          40.0  ...         2  1.684916e+06\n","Hamilton County         11.066667          20.0  ...         3  7.796505e+05\n","Hardee County           18.146667          54.0  ...         3  6.791781e+06\n","Hendry County           18.866667          63.9  ...         1  4.864095e+06\n","Hernando County         14.313333          33.0  ...        12  6.987439e+06\n","Highlands County        18.146667          54.0  ...        16  1.104181e+07\n","Hillsborough County     10.140000          27.0  ...       234  4.139342e+07\n","Indian River County     16.033333          49.9  ...        23  5.417251e+06\n","Lafayette County        11.066667          20.0  ...         1  4.387254e+05\n","Lake County             13.760000          36.9  ...        47  1.418739e+07\n","Lee County              15.206667          49.9  ...        63  6.269670e+07\n","Levy County             13.393333          35.9  ...         2  1.823098e+06\n","Manatee County          15.620000          42.0  ...        33  1.369299e+07\n","Marion County           11.150000          32.1  ...        30  1.934269e+07\n","Martin County           12.593333          22.0  ...        22  2.936391e+06\n","Miami-Dade County       15.900000          42.9  ...       730  2.417464e+08\n","Monroe County           15.900000          42.9  ...        31  1.061900e+08\n","Nassau County           14.566667          42.9  ...        16  4.184525e+06\n","Okeechobee County       14.053333          40.0  ...         2  2.920370e+06\n","Orange County           16.326667          44.1  ...       346  5.216380e+07\n","Osceola County          19.046667          49.9  ...        50  1.244058e+07\n","Palm Beach County       18.200000          51.1  ...       
167  8.307278e+07\n","Pasco County            14.313333          33.0  ...        58  1.824896e+07\n","Pinellas County         15.446667          42.9  ...       159  5.632790e+07\n","Polk County             17.544444          36.9  ...       116  4.351088e+07\n","Putnam County           16.980000          45.1  ...         4  1.110617e+07\n","Sarasota County         15.620000          45.1  ...        68  1.306549e+07\n","Seminole County         16.740000          49.9  ...        81  2.173058e+07\n","St. Johns County        16.980000          42.0  ...        34  1.862228e+07\n","St. Lucie County        16.033333          44.1  ...        32  2.350643e+07\n","Sumter County           13.760000          36.9  ...         7  3.424526e+06\n","Suwannee County         11.066667          20.0  ...         4  2.045323e+06\n","Union County            12.453333          35.0  ...         3  5.774985e+05\n","Volusia County          16.333333          35.0  ...        51  3.782572e+07\n","\n","[49 rows x 12 columns]"]},"metadata":{"tags":[]},"execution_count":1}]},{"cell_type":"markdown","metadata":{"id":"Fg-Ticrhdpih"},"source":["## Feature Selection\n","\n","We have 11 features, but do we really need them all? Especially when we only have 49 data points. 
As the following heat map shows, a lot of the variables correlate with each other, forming distinct clusters."]},{"cell_type":"code","metadata":{"id":"B-5M2buOeaQV","executionInfo":{"status":"ok","timestamp":1603699822936,"user_tz":240,"elapsed":2122,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}},"outputId":"50f1168f-a4f5-4274-e7e2-06b21a75dc74","colab":{"base_uri":"https://localhost:8080/","height":512}},"source":["plt.figure(figsize=(12,7))\n","sns.heatmap(df.corr(),\n","            annot=True,\n","            fmt = '.2f',\n","            cmap='coolwarm')\n","plt.show()"],"execution_count":2,"outputs":[{"output_type":"display_data","data":{"image/png":"\n","text/plain":["<Figure size 864x504 with 2 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}}]},{"cell_type":"markdown","metadata":{"id":"SWRPvSoZQ341"},"source":["We can identify these clusters as follows:\n","\n","- Wind: `maxwind_mean`, `maxwind_peak`\n","- Precipitation: `precip_total`, `precip_peak`\n","- Socioeconomic: `mhi`, `poverty_rate`\n","- Population: `poverty_pop`, `population`, `gdp`, `twt_total`, `twt_peak`\n","\n","Our goal is to predict `dmg`. To avoid high multicollinearity among our predictors, let's simply take one variable from each cluster. We'll pick the ones with the strongest correlation to `dmg`. This gives us `maxwind_peak`, `precip_peak`, `mhi`, `twt_total`.\n","\n","Let's verify that these variables are independent of each other using **Variance Inflation Factor (VIF)**. The VIF of an independent variable is a measure of how much its variation can be attributed to other independent variables. The higher it is, the more redundant the (not so) independent variable is. VIF is calculated as follows:\n","\n","$$ VIF = \\frac{1}{1 - R^2_i} $$\n","\n","where $R_i^2$ is the $R^2$ of the multilinear regression model of the $i^{\\text{th}}$ independent variable using the other independent variables as predictors. 
Why not just use $R^2_i$ to measure redundancy? One intuition is that the inverse relationship will harshly penalize smaller and smaller values of $1 - R^2_i$ (the so-called **tolerance**).\n","\n","Let's compute the VIF for each of the four predictors we're using:"]},{"cell_type":"code","metadata":{"id":"nnUg2sP6-DIk","executionInfo":{"status":"ok","timestamp":1603699822942,"user_tz":240,"elapsed":2105,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}},"outputId":"08b73a92-ccb8-47ac-8066-2147602e4f91","colab":{"base_uri":"https://localhost:8080/","height":144}},"source":["from sklearn.preprocessing import robust_scale\n","from statsmodels.stats.outliers_influence import variance_inflation_factor as vif\n","import statsmodels.api as sm\n","\n","# preprocess data\n","features = ['maxwind_peak', 'precip_peak', 'mhi', 'twt_total']\n","df = df[features+['dmg']]\n","\n","X = df[features].apply(robust_scale)\n","X_vif = sm.add_constant(X.values) # for the vif func\n","\n","vif_dict = {}\n","for f in features:\n","    idx = features.index(f) # need this for vif func\n","    vif_dict[f] = vif(X_vif, idx)\n","vif_dict"],"execution_count":3,"outputs":[{"output_type":"stream","text":["/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n","  import pandas.util.testing as tm\n"],"name":"stderr"},{"output_type":"execute_result","data":{"text/plain":["{'maxwind_peak': 1.1609234512973168,\n"," 'mhi': 1.0514008951220526,\n"," 'precip_peak': 1.0447003684876797,\n"," 'twt_total': 1.097731476287398}"]},"metadata":{"tags":[]},"execution_count":3}]},{"cell_type":"markdown","metadata":{"id":"jFmHET7jZGLm"},"source":["This is good! The typical threshold for VIF's is 10, sometimes 5. 
All the VIF's calculated were close to 1, indicating that the variables are very much independent of each other.\n","\n","## Modeling\n","Now that we have settled on our features, let's model our data. We will try out a linear regression and decision tree model and see which comes out best."]},{"cell_type":"code","metadata":{"id":"XHNL9NDLGKy6","executionInfo":{"status":"ok","timestamp":1603699822946,"user_tz":240,"elapsed":2092,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}}},"source":["from sklearn.model_selection import train_test_split\n","\n","Y = robust_scale(df['dmg'])\n","\n","X_train, X_test, Y_train, Y_test = train_test_split(X, Y,\n","                                                    test_size = 0.1,\n","                                                    random_state = 0,\n","                                                    shuffle = True)"],"execution_count":4,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"SpztfD5hGLd1"},"source":["### Linear Regression\n"]},{"cell_type":"code","metadata":{"id":"0ONf_rgoZhOS","executionInfo":{"status":"ok","timestamp":1603699822949,"user_tz":240,"elapsed":2082,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}}},"source":["from sklearn.linear_model import LinearRegression\n","\n","linreg = LinearRegression()\n","\n","linreg_model = linreg.fit(X_train, Y_train)"],"execution_count":5,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"J4WYVukfMbh2"},"source":["### Decision Tree\n","Now let's find the optimal decision tree model."]},{"cell_type":"code","metadata":{"id":"sS8der1cMoYP","executionInfo":{"status":"ok","timestamp":1603699823902,"user_tz":240,"elapsed":3023,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}},"outputId":"e0264778-4298-4af5-82ff-ff84db063a64","colab":{"base_uri":"https://localhost:8080/","height":35}},"source":["from sklearn.model_selection import GridSearchCV\n","from 
sklearn.tree import DecisionTreeRegressor\n","\n","tree = DecisionTreeRegressor(random_state = 0)\n","param_grid = {'min_impurity_decrease': [1e-2, 1e-1, 0.2, 0.3],\n","              'max_features': [None, 'sqrt'],\n","              'max_depth': [1, 2, 3, 4, 5, 6]}\n","\n","dt_model = GridSearchCV(tree, param_grid, scoring='r2')\n","dt_model.fit(X_train, Y_train)\n","dt_model.best_params_"],"execution_count":6,"outputs":[{"output_type":"execute_result","data":{"text/plain":["{'max_depth': 2, 'max_features': 'sqrt', 'min_impurity_decrease': 0.01}"]},"metadata":{"tags":[]},"execution_count":6}]},{"cell_type":"markdown","metadata":{"id":"g0Jhv1mSQ4gA"},"source":["### Model Selection\n","Let's see how the models stack up on their train/test results"]},{"cell_type":"code","metadata":{"id":"yCbkNzvnmocN","executionInfo":{"status":"ok","timestamp":1603699823910,"user_tz":240,"elapsed":3014,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}},"outputId":"9d0b8df9-e1dd-4a1d-f992-e4082fc8e907","colab":{"base_uri":"https://localhost:8080/","height":141}},"source":["linreg_train_score = linreg_model.score(X_train, Y_train)\n","linreg_test_score = linreg_model.score(X_test, Y_test)\n","dt_train_score = dt_model.score(X_train, Y_train)\n","dt_test_score = dt_model.score(X_test, Y_test)\n","\n","print(\"===== Linear Regression r2 Scores =====\")\n","print(\"Train: %.2f\" % linreg_train_score)\n","print(\"Test: %.2f\" % linreg_test_score)\n","print()\n","print(\"===== Decision Tree r2 Scores =====\")\n","print(\"Train: %.2f\" % dt_train_score)\n","print(\"Test: %.2f\" % dt_test_score)"],"execution_count":7,"outputs":[{"output_type":"stream","text":["===== Linear Regression r2 Scores =====\n","Train: 0.56\n","Test: 0.79\n","\n","===== Decision Tree r2 Scores =====\n","Train: 0.73\n","Test: 0.18\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"nFw2immG5Opb"},"source":["The linear regression model seems to generalize well, the test score 
is *higher* than the train score. The decision tree model, on the other hand, is severely overfit.\n","\n","Let's plot the predictions from each model against the true values of the dependent variable and see if this gives any insight."]},{"cell_type":"code","metadata":{"id":"nX7t6gdi8a7u","executionInfo":{"status":"ok","timestamp":1603699824139,"user_tz":240,"elapsed":3230,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}},"outputId":"1948cbb2-a769-4bc2-b3ba-12cb1589ba9a","colab":{"base_uri":"https://localhost:8080/","height":589}},"source":["from sklearn.metrics import r2_score\n","\n","Y_linreg = linreg_model.predict(X)\n","Y_dt = dt_model.predict(X)\n","\n","\n","Y_arr = np.array([Y, Y_linreg, Y_dt]).T\n","Y_df = pd.DataFrame(data=Y_arr,\n","                    columns = ['Y_true', 'Y_linreg', 'Y_dt'])\n","\n","sns.scatterplot(data = Y_df,\n","                x = 'Y_true',\n","                y = 'Y_linreg')\n","plt.plot(Y, Y, 'k-')\n","\n","plt.figure()\n","sns.scatterplot(data = Y_df,\n","                x = 'Y_true',\n","                y = 'Y_dt')\n","plt.plot(Y, Y, 'k-')\n","\n","print(\"Linear regression r2 = %.2f\" % r2_score(Y, Y_linreg))\n","print(\"Decision Tree r2 = %.2f\" % r2_score(Y, Y_dt))"],"execution_count":8,"outputs":[{"output_type":"stream","text":["Linear regression r2 = 0.76\n","Decision Tree r2 = 0.48\n"],"name":"stdout"},{"output_type":"display_data","data":{"image/png":"\n","text/plain":["<Figure size 432x288 with 1 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}},{"output_type":"display_data","data":{"image/png":"\n","text/plain":["<Figure size 432x288 with 1 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}}]},{"cell_type":"markdown","metadata":{"id":"LRrQZPrrTY-j"},"source":["Both models appear to struggle to accurately predict large values of Y (i.e. `dmg`), especially the decision tree model.\n","\n","How good are the models for \"low\" damage? 
Let's cut off the Y_true past 2 and see how the models compare."]},{"cell_type":"code","metadata":{"id":"w7g4B2RA6Psm","executionInfo":{"status":"ok","timestamp":1603699824836,"user_tz":240,"elapsed":3914,"user":{"displayName":"Angel Umana","photoUrl":"","userId":"02017183028986324110"}},"outputId":"69890709-e70a-4366-9e53-528bd5fc5081","colab":{"base_uri":"https://localhost:8080/","height":589}},"source":["Y_low = Y[Y < 2]\n","X_low = X[Y < 2]\n","\n","Y_low_linreg = linreg_model.predict(X_low)\n","Y_low_dt = dt_model.predict(X_low)\n","\n","\n","Y_low_arr = np.array([Y_low, Y_low_linreg, Y_low_dt]).T\n","Y_low_df = pd.DataFrame(data=Y_low_arr,\n","                    columns = ['Y_low_true', 'Y_low_linreg', 'Y_low_dt'])\n","\n","sns.scatterplot(data = Y_low_df,\n","                x = 'Y_low_true',\n","                y = 'Y_low_linreg')\n","plt.plot(Y_low, Y_low, 'k-')\n","\n","plt.figure()\n","sns.scatterplot(data = Y_low_df,\n","                x = 'Y_low_true',\n","                y = 'Y_low_dt')\n","plt.plot(Y_low, Y_low, 'k-')\n","\n","print(\"Linear regression r2 = %.2f\" % r2_score(Y_low, Y_low_linreg))\n","print(\"Decision Tree r2 = %.2f\" % r2_score(Y_low, Y_low_dt))"],"execution_count":9,"outputs":[{"output_type":"stream","text":["Linear regression r2 = 0.57\n","Decision Tree r2 = 0.70\n"],"name":"stdout"},{"output_type":"display_data","data":{"image/png":"\n","text/plain":["<Figure size 432x288 with 1 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}},{"output_type":"display_data","data":{"image/png":"\n","text/plain":["<Figure size 432x288 with 1 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}}]},{"cell_type":"markdown","metadata":{"id":"t66hzINe7h7V"},"source":["Suddenly, the predictive capabilities of the two models flipped! This seems to solidify the idea that the linear regression model was able to adapt to outliers better than the decision tree model. 
At the same time, the decision tree model can generalize better to lower values of `dmg`. In a context such as ours (natural disaster assessment) it is crucial to account for outliers (counties with a lot of federal aid needed); thus, the linear regression model is better suited for future predictions.\n","\n","## Final Remarks\n","\n","Where should we go from here?\n","\n","Ideas:\n","\n","- Investigate `feature_importances_` from the decision tree model. While it may not be better suited for generalization, maybe we can learn something from how it weighed the predictors. Depending on what is found, we can try to modify the linear regression model.\n","- Incorporate data from hurricanes Harvey and Michael."]},{"cell_type":"code","metadata":{"id":"wpNUo0XL9wsZ"},"source":[""],"execution_count":null,"outputs":[]}]}
\ No newline at end of file