From 17866a36cea142f1b391812b6f60f2898b6db994 Mon Sep 17 00:00:00 2001 From: ibidyouadu Date: Sun, 25 Oct 2020 02:09:09 -0400 Subject: [PATCH] removed ipynb checkpoints --- .../irma_modeling-checkpoint.ipynb | 1396 ----------------- 1 file changed, 1396 deletions(-) delete mode 100644 .ipynb_checkpoints/irma_modeling-checkpoint.ipynb diff --git a/.ipynb_checkpoints/irma_modeling-checkpoint.ipynb b/.ipynb_checkpoints/irma_modeling-checkpoint.ipynb deleted file mode 100644 index 9e79428..0000000 --- a/.ipynb_checkpoints/irma_modeling-checkpoint.ipynb +++ /dev/null @@ -1,1396 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "dpQIA2trCnVq" - }, - "source": [ - "# Hurricane Irma Damage Assessment Modeling\n", - "\n", - "In this notebook we develop a model to assess damages from hurricane Irma in Florida.\n", - "\n", - "Our dataset consists of weather, socioeconomic, and Twitter parameters from 49 counties in Florida during Irma. The dependent variable is the amount of federal aid from FEMA that a county received, `dmg`." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 695, - "status": "ok", - "timestamp": 1603518985831, - "user": { - "displayName": "Angel Umana", - "photoUrl": "", - "userId": "02017183028986324110" - }, - "user_tz": 240 - }, - "id": "ShFs31gOCjKO", - "outputId": "056be537-9ff9-445a-84b1-a65f73e070e1" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
wind_totalwind_peakgust_totalgust_peakmaxwind_totalmaxwind_peakprecip_totalprecip_peakpoverty_poppoverty_ratemhigdppopulationtwt_totaltwt_peakdmg
county
Alachua County87.221.4173.053.0186.835.018.2511.395381621.24523011912080269956182399.306765e+06
Baker County81.018.6152.038.1153.024.19.217.06390015.35185647294828355111.360526e+06
Bradford County87.221.4173.053.0186.835.018.2511.39453319.14499753248727732413.246248e+06
Brevard County130.928.7240.369.9223.541.04.692.257230312.45259620453753596849354713.192891e+07
Broward County128.231.7216.973.0235.249.010.444.1125228813.15670296591919195126020363821.329551e+08
Charlotte County112.124.9305.263.9265.044.11.690.761930010.851583396631418499872206.625988e+06
Citrus County103.328.8186.855.9214.733.013.185.972065414.443147333511014792967135.568339e+06
Clay County81.018.6152.038.1153.024.19.217.06208899.96537539760072160725381.219590e+07
Collier County41.512.1296.076.9283.063.95.124.204307511.76670916124953378488231636.194133e+07
Columbia County87.221.4173.053.0186.835.018.2511.391072016.5420971826541705031443.225476e+06
DeSoto County112.124.9305.263.9265.044.11.690.76876626.13734273528637489524.049290e+06
Dixie County96.226.8226.553.0200.935.94.392.38362724.43835517826116700759.079057e+05
Duval County143.029.3179.962.0234.442.99.217.0613806915.152105601467659501817721374.740938e+07
Flagler County116.323.5226.359.1238.844.18.664.661313712.052713180915111206763167.243310e+06
Gilchrist County96.226.8226.553.0200.935.94.392.38267516.14288025426018256316.418368e+05
Glades County77.122.1239.262.0210.840.09.967.09231218.94286517157313724421.684916e+06
Hamilton County57.717.0135.932.1132.820.04.392.38279124.03526237730914310537.796505e+05
Hardee County100.225.9372.175.0272.254.02.631.36602623.34005689334927245836.791781e+06
Hendry County115.127.8296.076.9283.063.912.618.84952523.938361124187241556214.864095e+06
Hernando County103.328.8186.855.9214.733.013.185.972577314.044710303126719086543126.987439e+06
Highlands County100.225.9372.175.0272.254.02.631.362005119.837445208878210542446161.104181e+07
Hillsborough County42.313.5134.848.0152.127.05.434.5021444215.5547417709379614368889902344.139342e+07
Indian River County107.024.0251.276.9240.549.910.868.111624910.651797500170215741379235.417251e+06
Lafayette County57.717.0135.932.1132.820.04.392.38157922.7415491606148732214.387254e+05
Lake County101.725.4248.160.0206.436.910.518.054302012.6514297452383356495151471.418739e+07
Lee County101.222.7214.773.0228.149.96.081.898584411.85419823806704754610380636.269670e+07
Levy County96.226.8226.553.0200.935.94.392.38832920.83727263670140770721.823098e+06
Manatee County116.225.2240.855.9234.342.07.085.714105710.85518911968028394855170331.369299e+07
Marion County60.522.0186.147.0156.132.10.440.235588016.2437727956019359977121301.934269e+07
Martin County101.213.7183.134.0188.922.016.329.431700210.958344653310316091285222.936391e+06
Miami-Dade County129.628.7269.062.9238.542.99.335.1845264916.749758141734334276158140637302.417464e+08
Monroe County129.628.7269.062.9238.542.99.335.18896311.863009409751175027158311.061900e+08
Nassau County115.828.5203.075.0218.542.910.059.0174849.17059018862618583249164.184525e+06
Okeechobee County77.122.1239.262.0210.840.02.631.36841521.842524102147741537722.920370e+06
Orange County107.125.5216.366.0244.944.110.367.4820152815.35402189817807138064517943465.216380e+07
Osceola County115.529.9268.069.0285.749.913.437.024889214.0492849207981367990205501.244058e+07
Palm Beach County145.031.0335.276.0273.051.13.961.9217086811.8600597686650514859419041678.307278e+07
Pasco County103.328.8186.855.9214.733.013.185.976763513.0512479330553539630206581.824896e+07
Pinellas County113.524.6242.063.9231.742.95.854.3211599012.251488441259459752807301595.632790e+07
Polk County77.533.5165.855.9157.936.98.066.2110784416.148328207796327080095241164.351088e+07
Putnam County160.035.5205.862.0254.745.118.2511.391895426.3343901925314741631641.110617e+07
Sarasota County160.035.5205.862.0254.745.19.217.06201188.3770227313073254261243681.306549e+07
Seminole County107.024.0251.276.9240.549.916.329.433983912.8499957030683321128331812.173058e+07
St. Johns County116.225.2240.855.9234.342.07.085.71380659.25842315773229426718154341.862228e+07
St. Lucie County107.825.9219.361.0251.144.111.289.245132111.26386517902542467832112322.350643e+07
Sumter County101.725.4248.160.0206.436.910.518.05106729.15793123124251287542173.424526e+06
Suwannee County57.717.0135.932.1132.820.04.392.38829920.344144851332441911242.045323e+06
Union County87.221.4173.053.0186.835.018.2511.39229122.24737334176314940335.774985e+05
Volusia County62.621.6148.155.0147.035.08.664.667987715.24691114864234547538266513.782572e+07
\n", - "
" - ], - "text/plain": [ - " wind_total wind_peak ... twt_peak dmg\n", - "county ... \n", - "Alachua County 87.2 21.4 ... 39 9.306765e+06\n", - "Baker County 81.0 18.6 ... 1 1.360526e+06\n", - "Bradford County 87.2 21.4 ... 1 3.246248e+06\n", - "Brevard County 130.9 28.7 ... 71 3.192891e+07\n", - "Broward County 128.2 31.7 ... 382 1.329551e+08\n", - "Charlotte County 112.1 24.9 ... 20 6.625988e+06\n", - "Citrus County 103.3 28.8 ... 13 5.568339e+06\n", - "Clay County 81.0 18.6 ... 8 1.219590e+07\n", - "Collier County 41.5 12.1 ... 63 6.194133e+07\n", - "Columbia County 87.2 21.4 ... 4 3.225476e+06\n", - "DeSoto County 112.1 24.9 ... 2 4.049290e+06\n", - "Dixie County 96.2 26.8 ... 5 9.079057e+05\n", - "Duval County 143.0 29.3 ... 137 4.740938e+07\n", - "Flagler County 116.3 23.5 ... 16 7.243310e+06\n", - "Gilchrist County 96.2 26.8 ... 1 6.418368e+05\n", - "Glades County 77.1 22.1 ... 2 1.684916e+06\n", - "Hamilton County 57.7 17.0 ... 3 7.796505e+05\n", - "Hardee County 100.2 25.9 ... 3 6.791781e+06\n", - "Hendry County 115.1 27.8 ... 1 4.864095e+06\n", - "Hernando County 103.3 28.8 ... 12 6.987439e+06\n", - "Highlands County 100.2 25.9 ... 16 1.104181e+07\n", - "Hillsborough County 42.3 13.5 ... 234 4.139342e+07\n", - "Indian River County 107.0 24.0 ... 23 5.417251e+06\n", - "Lafayette County 57.7 17.0 ... 1 4.387254e+05\n", - "Lake County 101.7 25.4 ... 47 1.418739e+07\n", - "Lee County 101.2 22.7 ... 63 6.269670e+07\n", - "Levy County 96.2 26.8 ... 2 1.823098e+06\n", - "Manatee County 116.2 25.2 ... 33 1.369299e+07\n", - "Marion County 60.5 22.0 ... 30 1.934269e+07\n", - "Martin County 101.2 13.7 ... 22 2.936391e+06\n", - "Miami-Dade County 129.6 28.7 ... 730 2.417464e+08\n", - "Monroe County 129.6 28.7 ... 31 1.061900e+08\n", - "Nassau County 115.8 28.5 ... 16 4.184525e+06\n", - "Okeechobee County 77.1 22.1 ... 2 2.920370e+06\n", - "Orange County 107.1 25.5 ... 346 5.216380e+07\n", - "Osceola County 115.5 29.9 ... 50 1.244058e+07\n", - "Palm Beach County 145.0 31.0 ... 167 8.307278e+07\n", - "Pasco County 103.3 28.8 ... 58 1.824896e+07\n", - "Pinellas County 113.5 24.6 ... 159 5.632790e+07\n", - "Polk County 77.5 33.5 ... 116 4.351088e+07\n", - "Putnam County 160.0 35.5 ... 4 1.110617e+07\n", - "Sarasota County 160.0 35.5 ... 68 1.306549e+07\n", - "Seminole County 107.0 24.0 ... 81 2.173058e+07\n", - "St. Johns County 116.2 25.2 ... 34 1.862228e+07\n", - "St. Lucie County 107.8 25.9 ... 32 2.350643e+07\n", - "Sumter County 101.7 25.4 ... 7 3.424526e+06\n", - "Suwannee County 57.7 17.0 ... 4 2.045323e+06\n", - "Union County 87.2 21.4 ... 3 5.774985e+05\n", - "Volusia County 62.6 21.6 ... 51 3.782572e+07\n", - "\n", - "[49 rows x 16 columns]" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from matplotlib import pyplot as plt\n", - "import seaborn as sns; sns.set()\n", - "\n", - "df = pd.read_csv('irma.csv')\n", - "df.set_index(keys='county',inplace=True)\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Fg-Ticrhdpih" - }, - "source": [ - "## Feature Selection\n", - "\n", - "We have 15 features, but do we really need them all? Especially when we only have 49 data points. Intuitively, some of these parameters should be redundant; 6 wind parameters surely can be summarized by just one parameter. The following heatmap illustrates this intuition well." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 508 - }, - "executionInfo": { - "elapsed": 2442, - "status": "ok", - "timestamp": 1603518988380, - "user": { - "displayName": "Angel Umana", - "photoUrl": "", - "userId": "02017183028986324110" - }, - "user_tz": 240 - }, - "id": "B-5M2buOeaQV", - "outputId": "253dbed1-de76-4c5e-df95-2e371597e692" - }, - "outputs": [], - "source": [ - "plt.figure(figsize=(12,7))\n", - "sns.heatmap(df.corr(),\n", - " annot=True,\n", - " fmt = '.2f',\n", - " cmap='coolwarm')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SWRPvSoZQ341" - }, - "source": [ - "We can see clusters of highly correlated parameters, such as gust and max wind, or GDP, population, poverty population, and Twitter activity (i.e. parameters that scale with population). Let's find these redundancies quantitivately. Specifically, we will use **Variance Inflation Factor (VIF)**. The VIF of an independent variable is a measure of how much its variation can be attributed to other independent variables. The higher it is, the more redundant the (not so) independent variable is. VIF is calculated as follows:\n", - "\n", - "$$ VIF = \\frac{1}{1 - R^2_i} $$\n", - "\n", - "where $R_i^2$ is the $R^2$ of the multilinear regression model of the $i^{\\text{th}}$ independent variable using the other independent variables as predictors. Why not just use $R^2_i$ to measure redundancy? One intuition is that the inverse relationship will harshly penalize smaller and smaller values of $1 - R^2_i$ (the so-called **tolerance**).\n", - "\n", - "We will loop through the independent variables, construct a multilinear regressor for each one, and compute and collect the VIF's. Standard practice is to discard predictors with VIF's over 10. 5 is used sometimes too, and we'll try both threshholds and see how well they filter out our data in the models." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 284 - }, - "executionInfo": { - "elapsed": 835, - "status": "ok", - "timestamp": 1603522001992, - "user": { - "displayName": "Angel Umana", - "photoUrl": "", - "userId": "02017183028986324110" - }, - "user_tz": 240 - }, - "id": "nnUg2sP6-DIk", - "outputId": "6657d1ae-9ada-45d6-a23a-c04866775f8c" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'wind_total': 8.387395886294627,\n", - " 'wind_peak': 4.817369273234568,\n", - " 'gust_total': 7.567335957063802,\n", - " 'gust_peak': 12.338585234562844,\n", - " 'maxwind_total': 19.164251074726092,\n", - " 'maxwind_peak': 20.868875746207838,\n", - " 'precip_total': 16.29621857409362,\n", - " 'precip_peak': 16.206418387213727,\n", - " 'poverty_pop': 151.20740399956702,\n", - " 'poverty_rate': 5.049258457172382,\n", - " 'mhi': 5.406652921497216,\n", - " 'gdp': 52.95007972702599,\n", - " 'population': 160.54620481443504,\n", - " 'twt_total': 252.13557403271057,\n", - " 'twt_peak': 276.13105651634396}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.linear_model import LinearRegression\n", - "from sklearn.preprocessing import robust_scale\n", - "\n", - "# preprocess data\n", - "features = df.drop(labels = ['dmg'], axis = 1).columns\n", - "X = df[features].apply(robust_scale)\n", - "Y = robust_scale(df['dmg']) # get a weird error if I try .apply\n", - "\n", - "vif_dict = {}\n", - "# run linear regression for each feature and calculate VIF\n", - "for feature in features:\n", - " X_i = X.drop(labels = [feature], axis = 1)\n", - " Y_i = X[feature]\n", - " reg = LinearRegression()\n", - " reg.fit(X_i, Y_i)\n", - " r2 = reg.score(X_i, Y_i)\n", - " vif = 1. / (1 - r2)\n", - " vif_dict[feature] = vif\n", - "vif_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 124 - }, - "executionInfo": { - "elapsed": 797, - "status": "ok", - "timestamp": 1603522548691, - "user": { - "displayName": "Angel Umana", - "photoUrl": "", - "userId": "02017183028986324110" - }, - "user_tz": 240 - }, - "id": "vbIlotUwTksT", - "outputId": "beee4687-e22c-41ac-9642-e49ce4482e2e" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "wind_total 8.387396\n", - "wind_peak 4.817369\n", - "gust_total 7.567336\n", - "poverty_rate 5.049258\n", - "mhi 5.406653\n", - "dtype: float64" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vif_s = pd.Series(data = list(vif_dict.values()), index=list(vif_dict.keys()))\n", - "vif_s[vif_s < 10]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jFmHET7jZGLm" - }, - "source": [ - "So... using a tolerance of 10, the VIF test says we should use three wind parameters, poverty rate, and mhi (the last two being strongly correlated with each other; see heat map). This does not look like it will work out well. Let's see:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "executionInfo": { - "elapsed": 614, - "status": "ok", - "timestamp": 1603523340594, - "user": { - "displayName": "Angel Umana", - "photoUrl": "", - "userId": "02017183028986324110" - }, - "user_tz": 240 - }, - "id": "0ONf_rgoZhOS", - "outputId": "ed70293f-8ea6-455f-856c-ce9134e61a18" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best CV r2: 0.25\n" - ] - } - ], - "source": [ - "from sklearn.model_selection import cross_validate\n", - "\n", - "vif_features = vif_s[vif_s < 10].index.values\n", - "\n", - "reg = LinearRegression()\n", - "cv_scores = cross_validate(reg,\n", - " X[vif_features], Y,\n", - " scoring = 'r2',\n", - " cv = 5)\n", - "best_score = np.max(cv_scores['test_score'])\n", - "print('Best CV r2: %.2f' % best_score)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "M7zFHveFb8GG" - }, - "source": [ - "By using only `wind_total`, `wind_peak`, `gust_total`, `povery_rate`, and `mhi`, the best $R^2$ from 5-fold cross-validation was 0.25. Not so hot. Let's rethink the multicollinearity problem. Looking back at the heat map,there's 4 distinct clusters:\n", - "\n", - "- Wind: `wind_total`, `wind_peak`, `gust_total`, `gust_peak`, `maxwind_total`, `maxwind_peak`\n", - "- Precipitation: `precip_total`, `precip_peak`\n", - "- Socioeconomic: `poverty_rate`, `mhi`\n", - "- Population: `poverty_pop`, `gdp`, `population`, `twt_total`, `twt_peak`\n", - "\n", - "Looking back at this I wonder two things:\n", - "\n", - "1. Why are there so many wind features?\n", - "2. Why the sum of wind features?\n", - "\n", - "The answer to #1 is actually easy: I don't know which one of average winds, gusts (i.e. sudden, fast winds), and maximum sustained winds (i.e. the highest average wind speed that is sustained over 1-minute intervals) is the best predictor for hurricane damages. But in retrospect, the popular Saffir-Simpson scale for classifying hurricanes in the US uses maximum sustained wind speed, so I should just use that.\n", - "\n", - "For #2, this is a good question. There's really no good reason, physically or analytically, to use the sum. What does the sum of average winds over a time period represent? The distance wind travelled? That doesn't seem physically significant. Furthermore, not every county has the same amount of samples of wind data, so I can't even say that it's a proxy of averages.\n", - "\n", - "So, the next step right now will be to look back at the weather data and\n", - "\n", - "- Remove `wind_` and `gust_` features.\n", - "- Replace `maxwind_total` with `maxwind_mean`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "authorship_tag": "ABX9TyNxZsCYKaph/u64/+lf7Ls+", - "collapsed_sections": [], - "mount_file_id": "1Q4J-Du4O02VX-aMhBgnqA8CcQVFolx1x", - "name": "irma_modeling.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3 (Spyder)", - "language": "python3", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} -- 2.43.0