mirror of
https://github.com/youronlydimwit/Data_ScienceUse_Cases.git
synced 2025-12-14 02:40:02 +01:00
1314 lines
457 KiB
Plaintext
1314 lines
457 KiB
Plaintext
|
|
{
|
|||
|
|
"cells": [
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "a789c814",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"# Load Basic Libraries\n",
|
|||
|
|
" Load some libraries to read and display the data"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 1,
|
|||
|
|
"id": "5d072c3f",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"import numpy as np\n",
|
|||
|
|
"import pandas as pd"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 2,
|
|||
|
|
"id": "947c545a",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/html": [
|
|||
|
|
"<div>\n",
|
|||
|
|
"<style scoped>\n",
|
|||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
|
" vertical-align: middle;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe tbody tr th {\n",
|
|||
|
|
" vertical-align: top;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe thead th {\n",
|
|||
|
|
" text-align: right;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"</style>\n",
|
|||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
|
" <thead>\n",
|
|||
|
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th>class</th>\n",
|
|||
|
|
" <th>cap-shape</th>\n",
|
|||
|
|
" <th>cap-surface</th>\n",
|
|||
|
|
" <th>cap-color</th>\n",
|
|||
|
|
" <th>bruises</th>\n",
|
|||
|
|
" <th>odor</th>\n",
|
|||
|
|
" <th>gill-attachment</th>\n",
|
|||
|
|
" <th>gill-spacing</th>\n",
|
|||
|
|
" <th>gill-size</th>\n",
|
|||
|
|
" <th>gill-color</th>\n",
|
|||
|
|
" <th>...</th>\n",
|
|||
|
|
" <th>stalk-surface-below-ring</th>\n",
|
|||
|
|
" <th>stalk-color-above-ring</th>\n",
|
|||
|
|
" <th>stalk-color-below-ring</th>\n",
|
|||
|
|
" <th>veil-type</th>\n",
|
|||
|
|
" <th>veil-color</th>\n",
|
|||
|
|
" <th>ring-number</th>\n",
|
|||
|
|
" <th>ring-type</th>\n",
|
|||
|
|
" <th>spore-print-color</th>\n",
|
|||
|
|
" <th>population</th>\n",
|
|||
|
|
" <th>habitat</th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </thead>\n",
|
|||
|
|
" <tbody>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>0</th>\n",
|
|||
|
|
" <td>p</td>\n",
|
|||
|
|
" <td>x</td>\n",
|
|||
|
|
" <td>s</td>\n",
|
|||
|
|
" <td>n</td>\n",
|
|||
|
|
" <td>t</td>\n",
|
|||
|
|
" <td>p</td>\n",
|
|||
|
|
" <td>f</td>\n",
|
|||
|
|
" <td>c</td>\n",
|
|||
|
|
" <td>n</td>\n",
|
|||
|
|
" <td>k</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>s</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>p</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>o</td>\n",
|
|||
|
|
" <td>p</td>\n",
|
|||
|
|
" <td>k</td>\n",
|
|||
|
|
" <td>s</td>\n",
|
|||
|
|
" <td>u</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>1</th>\n",
|
|||
|
|
" <td>e</td>\n",
|
|||
|
|
" <td>x</td>\n",
|
|||
|
|
" <td>s</td>\n",
|
|||
|
|
" <td>y</td>\n",
|
|||
|
|
" <td>t</td>\n",
|
|||
|
|
" <td>a</td>\n",
|
|||
|
|
" <td>f</td>\n",
|
|||
|
|
" <td>c</td>\n",
|
|||
|
|
" <td>b</td>\n",
|
|||
|
|
" <td>k</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>s</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>p</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>o</td>\n",
|
|||
|
|
" <td>p</td>\n",
|
|||
|
|
" <td>n</td>\n",
|
|||
|
|
" <td>n</td>\n",
|
|||
|
|
" <td>g</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>2</th>\n",
|
|||
|
|
" <td>e</td>\n",
|
|||
|
|
" <td>b</td>\n",
|
|||
|
|
" <td>s</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>t</td>\n",
|
|||
|
|
" <td>l</td>\n",
|
|||
|
|
" <td>f</td>\n",
|
|||
|
|
" <td>c</td>\n",
|
|||
|
|
" <td>b</td>\n",
|
|||
|
|
" <td>n</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>s</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>p</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>o</td>\n",
|
|||
|
|
" <td>p</td>\n",
|
|||
|
|
" <td>n</td>\n",
|
|||
|
|
" <td>n</td>\n",
|
|||
|
|
" <td>m</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>3</th>\n",
|
|||
|
|
" <td>p</td>\n",
|
|||
|
|
" <td>x</td>\n",
|
|||
|
|
" <td>y</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>t</td>\n",
|
|||
|
|
" <td>p</td>\n",
|
|||
|
|
" <td>f</td>\n",
|
|||
|
|
" <td>c</td>\n",
|
|||
|
|
" <td>n</td>\n",
|
|||
|
|
" <td>n</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>s</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>p</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>o</td>\n",
|
|||
|
|
" <td>p</td>\n",
|
|||
|
|
" <td>k</td>\n",
|
|||
|
|
" <td>s</td>\n",
|
|||
|
|
" <td>u</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>4</th>\n",
|
|||
|
|
" <td>e</td>\n",
|
|||
|
|
" <td>x</td>\n",
|
|||
|
|
" <td>s</td>\n",
|
|||
|
|
" <td>g</td>\n",
|
|||
|
|
" <td>f</td>\n",
|
|||
|
|
" <td>n</td>\n",
|
|||
|
|
" <td>f</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>b</td>\n",
|
|||
|
|
" <td>k</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>s</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>p</td>\n",
|
|||
|
|
" <td>w</td>\n",
|
|||
|
|
" <td>o</td>\n",
|
|||
|
|
" <td>e</td>\n",
|
|||
|
|
" <td>n</td>\n",
|
|||
|
|
" <td>a</td>\n",
|
|||
|
|
" <td>g</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </tbody>\n",
|
|||
|
|
"</table>\n",
|
|||
|
|
"<p>5 rows × 23 columns</p>\n",
|
|||
|
|
"</div>"
|
|||
|
|
],
|
|||
|
|
"text/plain": [
|
|||
|
|
" class cap-shape cap-surface cap-color bruises odor gill-attachment \\\n",
|
|||
|
|
"0 p x s n t p f \n",
|
|||
|
|
"1 e x s y t a f \n",
|
|||
|
|
"2 e b s w t l f \n",
|
|||
|
|
"3 p x y w t p f \n",
|
|||
|
|
"4 e x s g f n f \n",
|
|||
|
|
"\n",
|
|||
|
|
" gill-spacing gill-size gill-color ... stalk-surface-below-ring \\\n",
|
|||
|
|
"0 c n k ... s \n",
|
|||
|
|
"1 c b k ... s \n",
|
|||
|
|
"2 c b n ... s \n",
|
|||
|
|
"3 c n n ... s \n",
|
|||
|
|
"4 w b k ... s \n",
|
|||
|
|
"\n",
|
|||
|
|
" stalk-color-above-ring stalk-color-below-ring veil-type veil-color \\\n",
|
|||
|
|
"0 w w p w \n",
|
|||
|
|
"1 w w p w \n",
|
|||
|
|
"2 w w p w \n",
|
|||
|
|
"3 w w p w \n",
|
|||
|
|
"4 w w p w \n",
|
|||
|
|
"\n",
|
|||
|
|
" ring-number ring-type spore-print-color population habitat \n",
|
|||
|
|
"0 o p k s u \n",
|
|||
|
|
"1 o p n n g \n",
|
|||
|
|
"2 o p n n m \n",
|
|||
|
|
"3 o p k s u \n",
|
|||
|
|
"4 o e n a g \n",
|
|||
|
|
"\n",
|
|||
|
|
"[5 rows x 23 columns]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 2,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"df = pd.read_csv(\"https://raw.githubusercontent.com/youronlydimwit/Data_ScienceUse_Cases/main/Classification/Data/mushrooms.csv\")\n",
|
|||
|
|
"df.head()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "ed66f9da",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"# Data Reformatting\n",
|
|||
|
|
"Here we can see that all the data types are textual (object). We need to convert all of them to numerics (in this case, integers)."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 3,
|
|||
|
|
"id": "64fa324d",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/html": [
|
|||
|
|
"<div>\n",
|
|||
|
|
"<style scoped>\n",
|
|||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
|
" vertical-align: middle;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe tbody tr th {\n",
|
|||
|
|
" vertical-align: top;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe thead th {\n",
|
|||
|
|
" text-align: right;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"</style>\n",
|
|||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
|
" <thead>\n",
|
|||
|
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th>Column_Name</th>\n",
|
|||
|
|
" <th>Data_Type</th>\n",
|
|||
|
|
" <th>Missing_Data</th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </thead>\n",
|
|||
|
|
" <tbody>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>0</th>\n",
|
|||
|
|
" <td>class</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>1</th>\n",
|
|||
|
|
" <td>cap-shape</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>2</th>\n",
|
|||
|
|
" <td>cap-surface</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>3</th>\n",
|
|||
|
|
" <td>cap-color</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>4</th>\n",
|
|||
|
|
" <td>bruises</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>5</th>\n",
|
|||
|
|
" <td>odor</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>6</th>\n",
|
|||
|
|
" <td>gill-attachment</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>7</th>\n",
|
|||
|
|
" <td>gill-spacing</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>8</th>\n",
|
|||
|
|
" <td>gill-size</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>9</th>\n",
|
|||
|
|
" <td>gill-color</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>10</th>\n",
|
|||
|
|
" <td>stalk-shape</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>11</th>\n",
|
|||
|
|
" <td>stalk-root</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>12</th>\n",
|
|||
|
|
" <td>stalk-surface-above-ring</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>13</th>\n",
|
|||
|
|
" <td>stalk-surface-below-ring</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>14</th>\n",
|
|||
|
|
" <td>stalk-color-above-ring</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>15</th>\n",
|
|||
|
|
" <td>stalk-color-below-ring</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>16</th>\n",
|
|||
|
|
" <td>veil-type</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>17</th>\n",
|
|||
|
|
" <td>veil-color</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>18</th>\n",
|
|||
|
|
" <td>ring-number</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>19</th>\n",
|
|||
|
|
" <td>ring-type</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>20</th>\n",
|
|||
|
|
" <td>spore-print-color</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>21</th>\n",
|
|||
|
|
" <td>population</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>22</th>\n",
|
|||
|
|
" <td>habitat</td>\n",
|
|||
|
|
" <td>object</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </tbody>\n",
|
|||
|
|
"</table>\n",
|
|||
|
|
"</div>"
|
|||
|
|
],
|
|||
|
|
"text/plain": [
|
|||
|
|
" Column_Name Data_Type Missing_Data\n",
|
|||
|
|
"0 class object 0\n",
|
|||
|
|
"1 cap-shape object 0\n",
|
|||
|
|
"2 cap-surface object 0\n",
|
|||
|
|
"3 cap-color object 0\n",
|
|||
|
|
"4 bruises object 0\n",
|
|||
|
|
"5 odor object 0\n",
|
|||
|
|
"6 gill-attachment object 0\n",
|
|||
|
|
"7 gill-spacing object 0\n",
|
|||
|
|
"8 gill-size object 0\n",
|
|||
|
|
"9 gill-color object 0\n",
|
|||
|
|
"10 stalk-shape object 0\n",
|
|||
|
|
"11 stalk-root object 0\n",
|
|||
|
|
"12 stalk-surface-above-ring object 0\n",
|
|||
|
|
"13 stalk-surface-below-ring object 0\n",
|
|||
|
|
"14 stalk-color-above-ring object 0\n",
|
|||
|
|
"15 stalk-color-below-ring object 0\n",
|
|||
|
|
"16 veil-type object 0\n",
|
|||
|
|
"17 veil-color object 0\n",
|
|||
|
|
"18 ring-number object 0\n",
|
|||
|
|
"19 ring-type object 0\n",
|
|||
|
|
"20 spore-print-color object 0\n",
|
|||
|
|
"21 population object 0\n",
|
|||
|
|
"22 habitat object 0"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 3,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Missing data check function\n",
|
|||
|
|
"def completeness_check(input_df):\n",
|
|||
|
|
" # Create a new DataFrame\n",
|
|||
|
|
" summary_df = pd.DataFrame(columns=['Column_Name', 'Data_Type', 'Missing_Data'])\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Fill in the data\n",
|
|||
|
|
" summary_df['Column_Name'] = input_df.columns\n",
|
|||
|
|
" summary_df['Data_Type'] = input_df.dtypes.values\n",
|
|||
|
|
" summary_df['Missing_Data'] = input_df.isnull().sum().values\n",
|
|||
|
|
"\n",
|
|||
|
|
" return summary_df\n",
|
|||
|
|
"\n",
|
|||
|
|
"completeness_check(df)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "737ffd3b",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"Here I am using my own script to display either a histogram or a bar chart for the whole dataset."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 4,
|
|||
|
|
"id": "c571ce47",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"import wget"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 5,
|
|||
|
|
"id": "aa20d4af",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"\r",
|
|||
|
|
" 0% [ ] 0 / 1700\r",
|
|||
|
|
"100% [................................................................................] 1700 / 1700"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/plain": [
|
|||
|
|
"'data_desc_graph.py'"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 5,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"wget.download('https://raw.githubusercontent.com/youronlydimwit/Data_ScienceUse_Cases/main/Scripts/data_desc_graph.py',\n",
|
|||
|
|
" 'data_desc_graph.py')"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 6,
|
|||
|
|
"id": "f09fc246",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"from data_desc_graph import data_desc_graph"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "0126adc1",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"Run the plotting function on the dataset."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 7,
|
|||
|
|
"id": "14f49813",
|
|||
|
|
"metadata": {
|
|||
|
|
"scrolled": false
|
|||
|
|
},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABDAAAAsKCAYAAABAPfo0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOzde7ylZV3//9dbQEAFARkQZoAhxRKoRpkIswN5CLISrLCxEuyrjiKmlpWMVqI5aqViaGJ4CPCEk1oQggYo+rM4OCKCgATKaWCEUUQGDyjj5/fHfe1msWftPXvP7L3X2nu/no/H/VhrXffpuu+11rXu9bmvQ6oKSZIkSZKkYfaQQWdAkiRJkiRpcwxgSJIkSZKkoWcAQ5IkSZIkDT0DGJIkSZIkaegZwJAkSZIkSUPPAIYkSZIkSRp6BjCkzUhSE5hubsuenmTNBLb5vLbe4unO/9ZIcnGSi7dgvV2SnJTkidOQrSnX8jorx5Qe7zOX5Gntc3b4qOVvnuQ+TkrylK3K6CyW5ElJLkvyvXY+l0zjvm5u+3h9n3lvmMWf02kr85IcPvpzPsZyJ40qt9cnuSHJh5McsZX7PynJtF9T9Xw+Rqb7klyV5E+TZLr335OPfZO8s52/H7Z8fDHJa5I8si2zuOXxBTOQnyXtPdhtCrf56iS3JnkgyZVTtd1x9ndgkn9NckuS+5N8N8n/l+RlSXaY5LZm7DO5tabz93ei5U77XRz5Tv2knftrk7wvyZO2Yv9HJ/nzLV1fUn9DX7BJQ+BJo6ZvAp8elfasSW7zk229tVOXzWnxkjZN1i7Aa4FZEcCYZ/6OyX9eXwvM2wAG8D5gW+B36L63/zsD+/yzJAtmYD8zZZjKvF+my8tRwJuBXYFPJfnAFv7hO5zuOzJT11S9vz+/C1wMnALMyB+lJL8KXAX8Rtvvb7Z8/CdwAnDSTORjlCV078GUBDCSHAqsBM4CfhV47lRsd5z9HQNcAfwsXRn9G8BzgP8BXge8aJKbPJyZ/UxujffSfZYHbR1dPn6J7vP8TuDxwP8kedMWbvNoZuh7Kc0n2w46A9Kwq6pLe18nuR/41uj0SW5zHd2P5VBKsn1V3V9V1w46L5szktdB52O2qKqvDzoPkzHo97f9of1pYGVVfWYKthdgu6r60TiLfQ44DDgReOXW7nOQkmwHPDBkZd5lVfVAz+v3Jfkz4G3AlcBbB5KriRv9+/Nfrbbbs5mCvPe8Z5vcFU+yK/Ax4DrgaVX1vVH5eCvdH8AZkWQbYDpqnjy+Pb67qr6xtRsbrxxLcgBwJnA+cMyoz+Z5Sd4CPG5r8zBseq4z1gCbrbk6A3406nt1UZJTgZOBE5OsrqqPDyhvknrMhsisNOskeUKr+vn9VsX2xaPmb1KtMckfJvlyq4r73SRXJxn3rkuSxyX59yR3tWq8tyb5tyTb9iyzIMm7ktzWqqXe1u40bt/mj1SrPjjJp5PcB6xq8x7UhCQbq2r/Xqty+Z0k9yb5UJJHtWUWAze1Vd7TUy3zeeMcx+lJ1iT5pVYN+Yfpqkr/6Rjn7Vfbcd4DXNbm7ZyuSvMd7TivT/Jn7Q9jv/fmh0luT/I3jLoAzsaqz88bld63qnqSZyX57/be3Zvk8iTP7Jm/bZIVSb7W8nZHkremp1pwW+bvkny95e1bSb6Q5JfHOm9bIqOakGxuv9lYtfc1Pe/lST3r/3GSr/Ss+4Eke43a58OSnJrk2+mq7f97e68fdI57PgdPSvI/SX4A/EObtyzJZ5Ksa+f5y0mO63N8la6pxSvTVcX+XpJPJtmjTavSfb9uS/KqzZyr5wEb6H4r/yY9zcUmcew3J/lgkv+X5GvAj4DfGm+/dBfzpwIvSbJwM3l80PvR0jb5/Pac26Uj57Z9R36rzf/zltd7k5ydUbU/JvgZHtnvS5L8Q5I7gPuBXTJGVe4kL0xyRcvPd5J8Lskv9cx/XZv/3XaOP5PksM2cv0mrqpOBLwOv6N
n3DklOTvLV9pn7ZpL/TPIzPcucRHenG+DHI9+Rmc4/cC+wXW9CkpcmuSTJ3UnuSXLpyPvds8yY79kY+3khsAD401HBCwCq6ntVdcGo5G2SvD7J2paP/0yyaFQ+JvP9XpnkxCQ30X2f/hT417bIDdlYTi1u67w8yXU9n7HVScashZbuN+/09vLrvd+xTOB3Jht/J343yXuSrAPuHGt/wJ/R3VB8yajgBdDd8Kiq/27bnorP5MOS/H2Sm5L8qD2+JqNqHyV5Yjb+Vt6WrknN6zKqucfWnpP0aUKSrrx5VbpmHD9sn4tPjRznRM7DVGhBvL9qeX1FT/4WJPmXJP+b7jrvtnRN0Rb2LHM6cBywsOczefNM5l+aq6yBIU29nYEPA28HXg/8CXBqkuur6rP9Vkj3Z/GDdNVx/5LuD9PPMPZF5IhzgXuA44FvAQuBZ7T1R+6W/Q9dtdo30FX73YOu6vRD6S5UR5xNV1X+74GfbGa/bwcupKviegDwRmBv4Nfpqoj/LvAJ4E3AOW2dzd353xn4aNv/jcAy4JQk66vq9FHLfgj4CPD7wLbtwuuTdE1W/ha4mu5P4tvoLrZf3c7H7sBn6JoBHdeO/y+BfTeTtzGlC7KcAvxH2+Z9LR+Lexb7IF3zg7+nez8eT1dNeDHwe22ZV9FdyL6G7i7wzsBSJlglOj1Bqx4TCVJvbr9PAi6hu6D/l5a2pu1zeUv7KLCC7jPwRuAXkzyxqu5ry58GHENXtXw18FS697CfR9JV234L3fv2g5b+U3R3ft9M9/n8VeC9SXasqneP2sZzga/SNX/ak+7zeiawE91dzpH8vDnJ1VV13hh5+SRdc4Mv0H033kv7zkzi2KH7Xiyhqwp+F3DzGPvr9UbgBcDfAC/ezLITtTPdeXgLcAfde/7xJP9Md4f3BDaer3+mu6M/YiKf4RGvAb4ILAe2AX7YLzPp7iy/ku7cvpbufT2M7vv4P22xhXR3QNcADwf+GPh8kqVVddWkz8D4zgdenWTfqroV2J7uM/MGunJtN7rP1KVJfqaqvkn3mVgEPJ/us7Jh1DanI//p+b7vRPe+PB3461HLLW75u5mNTaDOTfKMqjp/1LITes+ApwHfrKrVk8jvCrr38//R/f68le77/2s9y0zm+/084BvAXwDfows87UZ3/Mew8W7+2iR/1Pb3euD/A3YEfo7xy9WX0L1PK+h+y9YCayb6O9PjHXSfqecC4/Vh8TTgi1U1keZVW/WZbJ+bTwMH0n1/r6b7zv1N29Yr23K7AxfRlRPH0gWK/owH/64xjefkLLrmF2+nu9bYge4zsRfwtQmehylRVT9KchHw+0m2bUGm3ei+IyvoapbtTXfu/rvt/4d053cB8AvAyA2NkWuuGcu/NCdVlZOT0yQmuovBD44x73SggF/vSdueLrhwWk/a89pyi9vrvwDunmQ+dm/beOY4y7ye7uLlCeMsc1Lbzsv7zLsYuLjn9eFt2U+NWu6PWvpT2+vF7fULJngsI+dt2aj0C4BbgIw6byePWu63W/rzRqWP/OHcvb1eSXchtm/PMg9v70/1pC0eY3sjx394e70zsB74xDjH9ittnWPHOGdL2utzx9vOBM7deNPho5a/uef1ZvfbtvGGUWnb0N2V+uyo9F9uy7+svf5puj8kfzVquVNGn+OeYzlqM/l5CN0fsvcAX+mT1/8Ftu1Je1tL/+uetG3pggn/upl9bdvWPWmyx97Sbga+Dzx6gu/nzbTyhe4C+EfAY9rrN/R+TnuO96RRaZt8fnvO7a/2pP1cS7se2GbU+frxSNokPsMj+72C9p3tWfZ5PLjMeyxd2fS2SXzWt2nvx/XAP431vRxn/ZPactuOMf9Fbf4vjrP/h9F95/9sotvdXP4nM7XPR7/v+Gmjz/kY35n/As7u81nZ5D0bYzvXAZdMMK8j2/7cqPS/aOl7byavY32/7wB2HOPz9dhR6e8ErtiC8/yC3s9rS5vo78zI5/HfJ7ivHwAf2c
LPw6Q+k3SBgweVAy39NXRlzR7t9Rvb60U9y+xIV+7VVJ6Tkbz2vH4Ko8rRrTgPI5+LxZtZ/3RgzTjz39S2s+c4+9+
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 1080x2880 with 24 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {
|
|||
|
|
"needs_background": "light"
|
|||
|
|
},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/plain": [
|
|||
|
|
"'Dependencies Satisfied'"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 7,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"data_desc_graph(df)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "efbc9d2d",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"## Encoding the unique values from the dataset"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 8,
|
|||
|
|
"id": "54a1fd3b",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# Using LabelEncoder function from sklearn\n",
|
|||
|
|
"from sklearn.preprocessing import LabelEncoder"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 9,
|
|||
|
|
"id": "27f56a20",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# Iterate every column\n",
|
|||
|
|
"for column in df.columns:\n",
|
|||
|
|
" le = LabelEncoder()\n",
|
|||
|
|
" df[column] = le.fit_transform(df[column])"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 10,
|
|||
|
|
"id": "caed560f",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/html": [
|
|||
|
|
"<div>\n",
|
|||
|
|
"<style scoped>\n",
|
|||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
|
" vertical-align: middle;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe tbody tr th {\n",
|
|||
|
|
" vertical-align: top;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe thead th {\n",
|
|||
|
|
" text-align: right;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"</style>\n",
|
|||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
|
" <thead>\n",
|
|||
|
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th>class</th>\n",
|
|||
|
|
" <th>cap-shape</th>\n",
|
|||
|
|
" <th>cap-surface</th>\n",
|
|||
|
|
" <th>cap-color</th>\n",
|
|||
|
|
" <th>bruises</th>\n",
|
|||
|
|
" <th>odor</th>\n",
|
|||
|
|
" <th>gill-attachment</th>\n",
|
|||
|
|
" <th>gill-spacing</th>\n",
|
|||
|
|
" <th>gill-size</th>\n",
|
|||
|
|
" <th>gill-color</th>\n",
|
|||
|
|
" <th>...</th>\n",
|
|||
|
|
" <th>stalk-surface-below-ring</th>\n",
|
|||
|
|
" <th>stalk-color-above-ring</th>\n",
|
|||
|
|
" <th>stalk-color-below-ring</th>\n",
|
|||
|
|
" <th>veil-type</th>\n",
|
|||
|
|
" <th>veil-color</th>\n",
|
|||
|
|
" <th>ring-number</th>\n",
|
|||
|
|
" <th>ring-type</th>\n",
|
|||
|
|
" <th>spore-print-color</th>\n",
|
|||
|
|
" <th>population</th>\n",
|
|||
|
|
" <th>habitat</th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </thead>\n",
|
|||
|
|
" <tbody>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>0</th>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>5</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>4</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>6</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>4</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>7</td>\n",
|
|||
|
|
" <td>7</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>4</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>3</td>\n",
|
|||
|
|
" <td>5</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>1</th>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>5</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>9</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>4</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>7</td>\n",
|
|||
|
|
" <td>7</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>4</td>\n",
|
|||
|
|
" <td>3</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>2</th>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>8</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>3</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>5</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>7</td>\n",
|
|||
|
|
" <td>7</td>\n",
|
|||
|
|
" <td>0</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>4</td>\n",
|
|||
|
|
" <td>3</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>3</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </tbody>\n",
|
|||
|
|
"</table>\n",
|
|||
|
|
"<p>3 rows × 23 columns</p>\n",
|
|||
|
|
"</div>"
|
|||
|
|
],
|
|||
|
|
"text/plain": [
|
|||
|
|
" class cap-shape cap-surface cap-color bruises odor gill-attachment \\\n",
|
|||
|
|
"0 1 5 2 4 1 6 1 \n",
|
|||
|
|
"1 0 5 2 9 1 0 1 \n",
|
|||
|
|
"2 0 0 2 8 1 3 1 \n",
|
|||
|
|
"\n",
|
|||
|
|
" gill-spacing gill-size gill-color ... stalk-surface-below-ring \\\n",
|
|||
|
|
"0 0 1 4 ... 2 \n",
|
|||
|
|
"1 0 0 4 ... 2 \n",
|
|||
|
|
"2 0 0 5 ... 2 \n",
|
|||
|
|
"\n",
|
|||
|
|
" stalk-color-above-ring stalk-color-below-ring veil-type veil-color \\\n",
|
|||
|
|
"0 7 7 0 2 \n",
|
|||
|
|
"1 7 7 0 2 \n",
|
|||
|
|
"2 7 7 0 2 \n",
|
|||
|
|
"\n",
|
|||
|
|
" ring-number ring-type spore-print-color population habitat \n",
|
|||
|
|
"0 1 4 2 3 5 \n",
|
|||
|
|
"1 1 4 3 2 1 \n",
|
|||
|
|
"2 1 4 3 2 3 \n",
|
|||
|
|
"\n",
|
|||
|
|
"[3 rows x 23 columns]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 10,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"df.head(3)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "839045a6",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# Run this code again to check if all the values have changed\n",
|
|||
|
|
"# data_desc_graph(df)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "871a7563",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"# Research Questions:\n",
|
|||
|
|
"- What types of machine learning models perform best on this dataset?\n",
|
|||
|
|
"- Which features are most indicative of a poisonous mushroom?"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 11,
|
|||
|
|
"id": "770f251e",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"import matplotlib.pyplot as plt\n",
|
|||
|
|
"import seaborn as sns"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "322b03d4",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"*Note that veil-type contains only 1 unique value."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 13,
|
|||
|
|
"id": "36d861f1",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjoAAAHlCAYAAAD8yFanAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOydd3gU1frHP2dLsum9kJAAoYTem4KAdBWliQ0EpKrYvVYU6QrSBaUpSFGpUpVeQu+hE2pIAum9J7t7fn/MpmyyIYF478/r3c/z5Ekyc945Zc6ceee0r5BSYsWKFStWrFix8k9E9f+dACtWrFixYsWKlX8XVkfHihUrVqxYsfKPxeroWLFixYoVK1b+sVgdHStWrFixYsXKPxaro2PFihUrVqxY+cdidXSsWLFixYoVK/9YrI6OFStW/nYIIYYKIQ5Xwv5PIcSQvzJN/2mEEIFCiAwhhPr/Oy1WrPw3Y3V0rFixYhEhxCtCiNOml220yXlo//+drpIIIcYLIVYVPyalfEpK+fO/Ia7lQggphHiuxPE5puNDK3idcCFE1weFkVJGSCkdpZSGSiTZipX/eayOjhUrVkohhPgAmANMBXyAQOB7oPcjXEtTkWP/RVwHCnuLTHkZANz6qyL4Ly8fK1b+VlgdHStWrJghhHABJgJjpJQbpZSZUsp8KeVWKeVHpjC2pl6M+6afOUIIW9O5TkKIKCHEJ0KIGGCZqddlvRBilRAiDRgqhHARQvxo6i26J4SYXNYwjRBirhAiUgiRJoQ4I4R4wnS8J/A58KKp5+m86fgBIcQI098qIcQXQoi7Qog4IcQKUx4RQlQ39cQMEUJECCEShBBjyymirUA7IYSb6f+ewAUgplh6awoh9gkhEk3XXC2EcDWdW4niOG41pfnjYukYLoSIAPYVO6YRQribyvRZ0zUchRA3hRCDH+LWWrHyP4nV0bFixUpJHgN0wO8PCDMWaAs0BZoArYEvip33BdyBasAo07HewHrAFVgN/AzogVpAM6A7MKKM+E6Z4nIHfgHWCSF0UsodKL1Oa0zDPE0s2A41/TwJBAGOwPwSYdoDwUAXYJwQot4D8p4DbAFeMv0/GFhRIowAvgb8gHpAADAeQEr5KhABPGtK8/Ridh1N4XsUv5iUMgkYBiwRQngDs4FQKWXJeK1YsVICq6NjxYqVkngACVJK/QPCDAQmSinjpJTxwATg1WLnjcBXUspcKWW26dgxKeUmKaURcAaeAt4z9RjFoby8X8ICUspVUspEKaVeSjkTsEVxTCrCQGCWlPK2lDID+Ax4qcTw0AQpZbaU8jxwHsV5exArgMGmnqGOwKYS6b0ppdxtyn88MMsUrjzGm8oju+QJKeUuYB2wF3gGGF2B61mx8j+PdRzYihUrJUkEPIUQmgc4O37A3WL/3zUdKyBeSplTwiay2N/VAC0QLYQoOKYqEaYQIcSHKL09foBEcZQ8y89KmWnVoMw9KiCm2N9ZKL0+ZSKlPCyE8ELpxdompcwulg9MvS7zgCcAJ5S8JVcgrRbzX4zFwFvAVCllYgWuZ8XK/zzWHh0rVqyU5BjK8EyfB4S5j+KsFBBoOlaAtGBT/FgkkAt4SildTT/OUsoGJY1M83E+AV4A3KSUrkAqyvBQWXGVl1Y9EFuOXXmsAj6k9LAVKMNWEmgspXQGBlGUXig7zWXmxTR/aZEpvjeEELUeJdFWrPyvYXV0rFixYoaUMhUYBywQQvQRQtgLIbRCiKeEEAXzSX4FvhBCeAkhPE3hV5V1TQtxRAO7gJlCCGfThOGaQghLwztOKI5JPKARQoxD6dEpIBaoLoQoqz37FXhfCFFDCOFI0ZyeBw3NVYR5QDcgpIw0ZwApQgh/4KMS52NR5gs9DJ+bfg8DZgArrHvsWLFSPlZHx4oVK6WQUs4CPkAZmolH6YF5i6K5KJOB0yirjS4CZ03HHobBgA1wBWVYZz1QxUK4ncCfKMu676L0NhUf4l
ln+p0ohDhrwf4nYCWKQ3LHZP/2Q6a1FFLKJCnlXimlpV6YCUBzlJ6n7cDGEue/RnEUU4QQ/yovLiFEC5T7Mdi0r840lN6fTyuTBytW/hcQlp9RK1asWLFixYqV/36sPTpWrFixYsWKlX8sVkfHihUrVqxYsfJvRwjxk2nTzktlnBdCiHmmzTAvCCGa/xXxWh0dK1asWLFixcp/guUoO4mXxVNAbdPPKOCHvyJSq6NjxYoVK1asWPm3I6UMAZIeEKQ3sEIqHAdchRCWFig8FFZHx4oVK1asWLHyd8Af8xWVUaZjlcK6M7KVB7JdG1ypZXm2Zy48sm0V+wc5/uVzOd6n/EBl8MPsU5WK+5VRbR7ZtmlASqXi/uG3vEe2/b7h6krF3W1Fy0e2HTTmiUrFHbL/XqXs7RxtH9lWZ6etVNw1gpzLD1QG6kp+rr73nCg/UBmcvV65zZk3H3/0fF8+U94m0g9m/Nv2j2zrlV+5upZmU9FNvUtTu2a1R79h5VDZ9r6X/vpoirTtABZLKRc/xCUs5a3SS8Otjo4VK1asWLFiBaGtnA8l8+ViFJmSRyUKRQC3gKqY77j+SFj30fkvRQgxHsiQUs74N0XxE9Ar/dJ1r5Bmz1oMUH/2WLx7dsSQncP54Z+Sdu4KAF7dnyBlYA9mLF1InhS07tSP7n2Hm9lKKVm3bBqXzx7CxlbHq2MmERhUH4BPRnQiOzMdgcTZxZWlK9ab2UZF3mX+nGncvnmDVwYPp0//Ih3Irb+vY/fObaSkJJGXZ8DV05/+I7/Gr3opZQGS46NY+8OHZGWm4FetPv1HTUOjseHq2b3UdMugQ/vW5OTkMvW7K9yN9Spl/8mbQTSp70RmlgGAbxbc5mZ4Jmn3V+FkE4ajg46xX31DjLEBienmtikJkWz76QNyslLxCajP00Omo9bYEHZ2B7tWf4pBr8fZxY2Pxs8hoFrNQrv7UeEsnjuJ8FthDHj1dZ7pOwiAvLxcJn/2OqNGvEaLFi3RGzWs2KEhIqb05r+vPedE9SoaEILYRD0/bkonN1/ioD/N9ZNzMOr1PP9cL4Z1bYvh2olCu1O37/Heyp34uzsB0Ll+DV7vUtSDo27ckWhbH3Jy9Uz7PpHrtzNLxf35O3Vo0tCFzEwlXVPnXefmnUzatXJDnbeJY0cOotPpeGbIN2hcS9+zcwdXcfbAz6QmRPDG18ewc3QH4PFgiZs2kazMVMZNmE2OY38c3OqY2Y7s50qNqjYIICZBz8L1yeTmSexsBVPf8cbNWdlk+M+jWWw5VEpTkyHPOFK9ikaxTzKwbGs6ufnw9gtONAiywSghMcXAr3tyuH3fYGb7Snc7An3UCCAu2cjKnVnk5SvPgSFiLjcuHcZWp6PTC1+jdimd70tHV3Hp8ArSEiMYPO4YOgc3pJTsXvkO4Vf2oFbb4OhahTrNn6VltzFmthePrOLCIcV26Phj2Dm4AXDv1gl2LB9DUPWqAHTr1o233nqrVNyWCAkJYcqUKWTn5vNkt2fpPWCw2fl7keEsmjuFO7eu8+Kro+nV75XCc28P74ednT1vjXmDFi1aoXXwZdMRPdEWOnD7t1fj5yEwGOFeomTrMQNGCf3aqagXKNBqBNsPpLJ8U9m9v8P6e/BkGyde/TgcAG3+OWKvLUKtlvR4+ll6PvuCWfioyLssmPONqX0ZQW9T+3IvKoLJ4z4mKSkBYXpvjh78Mi8891Sh7aETp/nxl3WohAq1WsXbw1+lcf26AJw4e55p8xeTkpqGnb09/foPYMAL5hq2+/fvZcO6tQDo7Ox4c8zbBAXVJD4+jlkzvyU5OYmoyMgrwOKwsLC5D7hFj8QO53qVcgh6pl0t11MSQlRH0YdraOHcMygbkz4NtAHmSSlbVyZNYJ2jY6VslvOA2fFePTvgUKs6B+p15+IbX9Jw/njlhEpF3dljmTD+KwadimHj5m2cPfon0ZG3zOwvnztMfPRdxn+3jVdGj+O3JcqmukaDgaz0VD6cvI
LV6//EydmFyIhwM1tHJ2eGj36H3v1eNDuemBDP9q0beGXwCGrXqUetRk9Qr3kXtq6YaDEPO9fO5LHug3l/2k7s7F0
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 576x432 with 2 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {
|
|||
|
|
"needs_background": "light"
|
|||
|
|
},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# To get a better look, let's run a correlation matrix\n",
|
|||
|
|
"df_corr = df.corr()\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Plot the results\n",
|
|||
|
|
"plt.figure(figsize=(8, 6))\n",
|
|||
|
|
"sns.heatmap(df_corr, annot=True, cmap='coolwarm', fmt='.2f')\n",
|
|||
|
|
"plt.title('Correlation Matrix')\n",
|
|||
|
|
"plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "756d9107",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"## What types of machine learning models perform best on this dataset?"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 14,
|
|||
|
|
"id": "32f237e3",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
|
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 15,
|
|||
|
|
"id": "9846d182",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|||
|
|
"from sklearn.svm import SVC\n",
|
|||
|
|
"from sklearn.neighbors import KNeighborsClassifier\n",
|
|||
|
|
"from sklearn.linear_model import LogisticRegression\n",
|
|||
|
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
|||
|
|
"from sklearn.naive_bayes import GaussianNB\n",
|
|||
|
|
"from sklearn.ensemble import AdaBoostClassifier\n",
|
|||
|
|
"from sklearn.ensemble import GradientBoostingClassifier"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 17,
|
|||
|
|
"id": "90e82284",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# Split the data into features (X) and labels (y)\n",
|
|||
|
|
"X = df.drop(columns=['class'])\n",
|
|||
|
|
"y = df['class']\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Split the data into training and testing sets\n",
|
|||
|
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Define a dictionary to store results\n",
|
|||
|
|
"results = {'Model': [], 'F1_score': [], 'Accuracy': [], 'Precision': [], 'Recall': []}"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 18,
|
|||
|
|
"id": "9a317b55",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"models = {\n",
|
|||
|
|
" 'Random Forest': RandomForestClassifier(),\n",
|
|||
|
|
" 'Support Vector Machine': SVC(),\n",
|
|||
|
|
" 'K-Nearest Neighbors': KNeighborsClassifier(),\n",
|
|||
|
|
" 'Logistic Regression': LogisticRegression(),\n",
|
|||
|
|
" 'Decision Tree': DecisionTreeClassifier(),\n",
|
|||
|
|
" 'Naive Bayes': GaussianNB(),\n",
|
|||
|
|
" 'AdaBoost': AdaBoostClassifier(),\n",
|
|||
|
|
" 'Gradient Boosting': GradientBoostingClassifier()\n",
|
|||
|
|
"}"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 19,
|
|||
|
|
"id": "b52ce7b0",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stderr",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
|
|||
|
|
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
|
|||
|
|
"\n",
|
|||
|
|
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
|
|||
|
|
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
|
|||
|
|
"Please also refer to the documentation for alternative solver options:\n",
|
|||
|
|
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
|
|||
|
|
" n_iter_i = _check_optimize_result(\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"for model_name, model in models.items():\n",
|
|||
|
|
" # Train the model\n",
|
|||
|
|
" model.fit(X_train, y_train)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Make predictions\n",
|
|||
|
|
" y_pred = model.predict(X_test)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Evaluate the model\n",
|
|||
|
|
" f1 = f1_score(y_test, y_pred)\n",
|
|||
|
|
" accuracy = accuracy_score(y_test, y_pred)\n",
|
|||
|
|
" precision = precision_score(y_test, y_pred)\n",
|
|||
|
|
" recall = recall_score(y_test, y_pred)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Store results in the dictionary\n",
|
|||
|
|
" results['Model'].append(model_name)\n",
|
|||
|
|
" results['F1_score'].append(f1)\n",
|
|||
|
|
" results['Accuracy'].append(accuracy)\n",
|
|||
|
|
" results['Precision'].append(precision)\n",
|
|||
|
|
" results['Recall'].append(recall)\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Create a DataFrame from the results dictionary\n",
|
|||
|
|
"results_df = pd.DataFrame(results)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 20,
|
|||
|
|
"id": "85dd14ef",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/html": [
|
|||
|
|
"<div>\n",
|
|||
|
|
"<style scoped>\n",
|
|||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
|
" vertical-align: middle;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe tbody tr th {\n",
|
|||
|
|
" vertical-align: top;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe thead th {\n",
|
|||
|
|
" text-align: right;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"</style>\n",
|
|||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
|
" <thead>\n",
|
|||
|
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th>Model</th>\n",
|
|||
|
|
" <th>F1_score</th>\n",
|
|||
|
|
" <th>Accuracy</th>\n",
|
|||
|
|
" <th>Precision</th>\n",
|
|||
|
|
" <th>Recall</th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </thead>\n",
|
|||
|
|
" <tbody>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>0</th>\n",
|
|||
|
|
" <td>Random Forest</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>1</th>\n",
|
|||
|
|
" <td>Support Vector Machine</td>\n",
|
|||
|
|
" <td>0.992278</td>\n",
|
|||
|
|
" <td>0.992615</td>\n",
|
|||
|
|
" <td>0.998705</td>\n",
|
|||
|
|
" <td>0.985934</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>2</th>\n",
|
|||
|
|
" <td>K-Nearest Neighbors</td>\n",
|
|||
|
|
" <td>0.996178</td>\n",
|
|||
|
|
" <td>0.996308</td>\n",
|
|||
|
|
" <td>0.992386</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>3</th>\n",
|
|||
|
|
" <td>Logistic Regression</td>\n",
|
|||
|
|
" <td>0.945153</td>\n",
|
|||
|
|
" <td>0.947077</td>\n",
|
|||
|
|
" <td>0.942748</td>\n",
|
|||
|
|
" <td>0.947570</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>4</th>\n",
|
|||
|
|
" <td>Decision Tree</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>5</th>\n",
|
|||
|
|
" <td>Naive Bayes</td>\n",
|
|||
|
|
" <td>0.919671</td>\n",
|
|||
|
|
" <td>0.921846</td>\n",
|
|||
|
|
" <td>0.909887</td>\n",
|
|||
|
|
" <td>0.929668</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>6</th>\n",
|
|||
|
|
" <td>AdaBoost</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>7</th>\n",
|
|||
|
|
" <td>Gradient Boosting</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" <td>1.000000</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </tbody>\n",
|
|||
|
|
"</table>\n",
|
|||
|
|
"</div>"
|
|||
|
|
],
|
|||
|
|
"text/plain": [
|
|||
|
|
" Model F1_score Accuracy Precision Recall\n",
|
|||
|
|
"0 Random Forest 1.000000 1.000000 1.000000 1.000000\n",
|
|||
|
|
"1 Support Vector Machine 0.992278 0.992615 0.998705 0.985934\n",
|
|||
|
|
"2 K-Nearest Neighbors 0.996178 0.996308 0.992386 1.000000\n",
|
|||
|
|
"3 Logistic Regression 0.945153 0.947077 0.942748 0.947570\n",
|
|||
|
|
"4 Decision Tree 1.000000 1.000000 1.000000 1.000000\n",
|
|||
|
|
"5 Naive Bayes 0.919671 0.921846 0.909887 0.929668\n",
|
|||
|
|
"6 AdaBoost 1.000000 1.000000 1.000000 1.000000\n",
|
|||
|
|
"7 Gradient Boosting 1.000000 1.000000 1.000000 1.000000"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 20,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"results_df"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "55d00c78",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"- **F1 Score** balances precision and recall, providing a single metric that considers both false positives and false negatives.\n",
|
|||
|
|
"- **Accuracy** provides an overall measure of correct predictions but may not be suitable for imbalanced datasets.\n",
|
|||
|
|
"- **Precision** focuses on minimizing false positives, useful when the cost of false positives is high.\n",
|
|||
|
|
"- **Recall** focuses on minimizing false negatives, useful when the cost of false negatives is high."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "3b44321e",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"# Which features are most indicative of a poisonous mushroom?"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 29,
|
|||
|
|
"id": "c8620ac6",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# Create an empty dictionary to store feature importance scores\n",
|
|||
|
|
"feature_importance = {}"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 30,
|
|||
|
|
"id": "e9e81c6c",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# Iterate through each model in the 'models' dictionary\n",
|
|||
|
|
"for model_name, model in models.items():\n",
|
|||
|
|
" # Assuming RandomForestClassifier is used, adapt for other models\n",
|
|||
|
|
" if model_name == 'Random Forest':\n",
|
|||
|
|
" # Get feature importance from the trained model\n",
|
|||
|
|
" importances = model.feature_importances_\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Map feature names to their importance scores\n",
|
|||
|
|
" feature_importance[model_name] = dict(zip(X.columns, importances))"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 31,
|
|||
|
|
"id": "68387008",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# Convert the dictionary to a DataFrame for better visualization\n",
|
|||
|
|
"feature_importance_df = pd.DataFrame(feature_importance).T"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 32,
|
|||
|
|
"id": "4653e9e9",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/html": [
|
|||
|
|
"<div>\n",
|
|||
|
|
"<style scoped>\n",
|
|||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
|
" vertical-align: middle;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe tbody tr th {\n",
|
|||
|
|
" vertical-align: top;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe thead th {\n",
|
|||
|
|
" text-align: right;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"</style>\n",
|
|||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
|
" <thead>\n",
|
|||
|
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th>bruises</th>\n",
|
|||
|
|
" <th>cap-color</th>\n",
|
|||
|
|
" <th>cap-shape</th>\n",
|
|||
|
|
" <th>cap-surface</th>\n",
|
|||
|
|
" <th>gill-attachment</th>\n",
|
|||
|
|
" <th>gill-color</th>\n",
|
|||
|
|
" <th>gill-size</th>\n",
|
|||
|
|
" <th>gill-spacing</th>\n",
|
|||
|
|
" <th>habitat</th>\n",
|
|||
|
|
" <th>odor</th>\n",
|
|||
|
|
" <th>...</th>\n",
|
|||
|
|
" <th>ring-type</th>\n",
|
|||
|
|
" <th>spore-print-color</th>\n",
|
|||
|
|
" <th>stalk-color-above-ring</th>\n",
|
|||
|
|
" <th>stalk-color-below-ring</th>\n",
|
|||
|
|
" <th>stalk-root</th>\n",
|
|||
|
|
" <th>stalk-shape</th>\n",
|
|||
|
|
" <th>stalk-surface-above-ring</th>\n",
|
|||
|
|
" <th>stalk-surface-below-ring</th>\n",
|
|||
|
|
" <th>veil-color</th>\n",
|
|||
|
|
" <th>veil-type</th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </thead>\n",
|
|||
|
|
" <tbody>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>Random Forest</th>\n",
|
|||
|
|
" <td>0.05007</td>\n",
|
|||
|
|
" <td>0.013889</td>\n",
|
|||
|
|
" <td>0.004782</td>\n",
|
|||
|
|
" <td>0.008458</td>\n",
|
|||
|
|
" <td>0.002411</td>\n",
|
|||
|
|
" <td>0.113684</td>\n",
|
|||
|
|
" <td>0.146035</td>\n",
|
|||
|
|
" <td>0.047978</td>\n",
|
|||
|
|
" <td>0.033642</td>\n",
|
|||
|
|
" <td>0.134474</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>0.066536</td>\n",
|
|||
|
|
" <td>0.102494</td>\n",
|
|||
|
|
" <td>0.011799</td>\n",
|
|||
|
|
" <td>0.018804</td>\n",
|
|||
|
|
" <td>0.064496</td>\n",
|
|||
|
|
" <td>0.021023</td>\n",
|
|||
|
|
" <td>0.043097</td>\n",
|
|||
|
|
" <td>0.046428</td>\n",
|
|||
|
|
" <td>0.002559</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </tbody>\n",
|
|||
|
|
"</table>\n",
|
|||
|
|
"<p>1 rows × 22 columns</p>\n",
|
|||
|
|
"</div>"
|
|||
|
|
],
|
|||
|
|
"text/plain": [
|
|||
|
|
" bruises cap-color cap-shape cap-surface gill-attachment \\\n",
|
|||
|
|
"Random Forest 0.05007 0.013889 0.004782 0.008458 0.002411 \n",
|
|||
|
|
"\n",
|
|||
|
|
" gill-color gill-size gill-spacing habitat odor ... \\\n",
|
|||
|
|
"Random Forest 0.113684 0.146035 0.047978 0.033642 0.134474 ... \n",
|
|||
|
|
"\n",
|
|||
|
|
" ring-type spore-print-color stalk-color-above-ring \\\n",
|
|||
|
|
"Random Forest 0.066536 0.102494 0.011799 \n",
|
|||
|
|
"\n",
|
|||
|
|
" stalk-color-below-ring stalk-root stalk-shape \\\n",
|
|||
|
|
"Random Forest 0.018804 0.064496 0.021023 \n",
|
|||
|
|
"\n",
|
|||
|
|
" stalk-surface-above-ring stalk-surface-below-ring veil-color \\\n",
|
|||
|
|
"Random Forest 0.043097 0.046428 0.002559 \n",
|
|||
|
|
"\n",
|
|||
|
|
" veil-type \n",
|
|||
|
|
"Random Forest 0.0 \n",
|
|||
|
|
"\n",
|
|||
|
|
"[1 rows x 22 columns]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 32,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"feature_importance_df.head()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": 34,
|
|||
|
|
"id": "ce94b8d5",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAzoAAAHwCAYAAABuXRpPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAABiGUlEQVR4nO3deZRdVZn38e+PgIQxqCBvADWoEQQSAhTIpILSTjiLoqIEJxpn2kYbG0WwtYWGbmkFRbQFVFQE0WZQAZFZpoQEAoraDUEFWkUxECYhPO8f9xReiqrKTapuVeXm+1nrrjp3nz0851StVD3Z++ybqkKSJEmSeskq4x2AJEmSJI02Ex1JkiRJPcdER5IkSVLPMdGRJEmS1HNMdCRJkiT1HBMdSZIkST3HREeSJE0oSS5K8q7xjkPSis1ER5LUkSQLk9yfZHHba6NR6HOP0Yqxg/EOS/LNsRpvOEn2S3LZeMcxUJNkPNB8fxcluSTJjFEe4wnN9+LXSe5tfg6+lmTaaI7TNt5JST7djb4lTVwmOpKkZfHKqlq77XX7eAaTZNXxHH95rQBxv7+q1gaeDFwEfGN5OhnmOk8HXgW8BZgCbA3MBV60POMsJYZJo92npBWDiY4kaUSSTEnyX0nuSHJbkk/3/3GZ5JlJfprkT0nuTHJKkvWac98Angac1cwefDTJbkl+N6D/R2d9mlmA05N8M8ndwH7Djd9B7JXkvc3Mwj1J/qWJ+Yokdyf5bpInNHV3S/K7JP/cXMvCJPsMuA9fT/LHJLcm+XiSVZpz+yW5PMnnkvwZOBU4Htipufa/NPX2TDKvGfu3SQ5r639aE+/sJL9pYjik7fykJrb/ba5lbpKnNuc2T3J+kj8n+WWSN3Zyf6rqYeA7wBZt4+zQ3J+/NPf82P571HZP35fk18CvB7nnewB/B7y6qq6pqoeralFVHVdV/9VW9enNPbsnyXlJ1m/r47Qk/9c247Rl27mTknwpyQ+T3Au8E9gH+Ghzr8/q5NolrfhMdCRJI3Uy8DDwLGAb4MVA//MVAT4LbAQ8B3gqcBhAVb0N+A1/myX6tw7HezWtGYH1gFOWMn4nXgpsB+wIfBQ4gdYfxk8FtgLe3Fb3/wHrAxsDs4ETkmzWnPsCrdmJZwAvAPYF3t7W9rnAzcBTgLcCBwBXNNe+XlPn3qbdesCewHuSvGZAvLsCm9Ga/Tg0yXOa8g83sb4cWBd4B3BfkrWA84FvNWO/Gfhie3IwlCaB2Qe4sq14CfAPzX3YqYnjvQOavqa53i14vD2Aq6vqt0sZ/i207t9TgCcAB7Wd+xEwvTl3La2fg4FtPwOsA3y9Of9vzb1+5VLGldQjTHQkScviB83/5P8lyQ+SbAi8DDiwqu6tqj8AnwPeBFBV/1NV51fVg1X1R+A/aCUBI3FFVf2gqh6h9Qf9kON36MiquruqbgRuAM6rqpurahGtP6i3GVD/E831XAycA7yxmUHaG/hYVd1TVQuBfwfe1tbu9qr6QjODcf9ggVTVRVW1oKoeqarrgW/z+Pt1eFXdX1XXAdfRWvYFreTu41X1y2q5rqr+BLwCWFhVJzZjXwt8D9hrmHvy+WaWaTHwfuDwthjnVtWVTV8LgS8PEuNnq+rPQ1znk4E7hhm734lV9aumj+8Cs9pi+Fpznx+klThvnWRKW9v/rqrLm/v4QAdjSepBE32NsCRpYnlNVf2k/02SHYDVgDuS9BevAvy2Of8U4PPA82j97/oqwF0jjKF9JuDpw43fod+3Hd8/yPv/1/b+rqq6t+39rbRmq9anNetw64BzGw8R96CSPBc4gtZM0hOA1YHTBlT7v7bj+4C1m+OnAv87SLdPB57bvzyusSrDP3fzwar6arP0bhfgzCQvqKrrkzybVsLaB6zZ9DV3QPvhrvVPwLOHOd9v0OtsksrPAG8ANgAeaeqsDyzqYHxJKwlndCRJI/Fb4EFg/apar3mtW1X9y6I+CxQws6rWpb
VkK23ta0B/99L64xl49I/aDQbUaW+ztPFH2xObpWD9ngbcDtwJPEQrqWg/d9sQcQ/2HlrLy84EnlpVU2g9x5NB6g3mt8Azhyi/uO3+rNcs4XrP0jpsZkQuBf6H1pJAgC8BNwHTm+/pPw8S42DX1u8nwA5JNlna+EN4C63li3vQWio4rSkf7udquHgk9SgTHUnScquqO4DzgH9Psm6SVZqH+fuXMq1Da/nTX5JsDHxkQBe/p/VMS79fAZObh/JXAz5Oa1ZjecfvhsPT2h75ebSWhZ1WVUtoLa/6TJJ1kjyd1jMzw21l/Xtgk/YH+Wndrz9X1QPNbNlbliGurwL/kmR6WmYmeTJwNvDsJG9Lslrz2r7t2Z5hJdmJ1rM2N7bFeDewOMnmwFITpnbNjOD5wPeTbJdk1eaeHZDkHR10sQ6t5PZPtJLif+2gzcCfM0krARMdSdJI7UtrmdXPaS1LOx2Y2pw7HNiW1pKic4AzBrT9LPDx5pmfg5rnYt5L64/222jN8PyO4Q03/mj7v2aM22k94H5AVd3UnPsArXhvBi6jNTvztWH6+imt5OH/ktzZlL0X+FSSe4BDaSVPnfqPpv55tBKR/wLWqKp7aM3GvKmJ+/+AIxkmgQSObXYoW0xridvHq+pHzbmDaCVg9wBfobWD3LLaC/hh03YRrWej+mjN9izN12ktC7yN1vf8yuGrA617sUX/s2XLEa+kFVCqnM2VJGlpkuwGfLOqlnfJlSRpDDmjI0mSJKnnmOhIkiRJ6jkuXZMkSZLUc5zRkSRJktRzTHQkSZIk9ZxVxzsA9ab111+/pk2bNt5hSJIkqcfNnTv3zqoa+OHSJjrqjmnTpjFnzpzxDkOSJEk9Lsmtg5W7dE2SJElSzzHRkSRJktRzTHQkSZIk9Ryf0ZEkSdJK5aGHHuJ3v/sdDzzwwHiHomUwefJkNtlkE1ZbbbWO6pvoSJIkaaXyu9/9jnXWWYdp06aRZLzDUQeqij/96U/87ne/Y9NNN+2ojUvXJEmStFJ54IEHePKTn2ySswJJwpOf/ORlmoUz0ZEkSdJKxyRnxbOs3zMTHUmSJGmMTZo0iVmzZrHVVlvxyle+kr/85S+j0u9JJ53E+9///lHpq91uu+3GZpttxqxZs5g1axann376qI8BsHDhQr71rW+NSl8+oyNJkqSV2rSDzxnV/hYesedS66yxxhrMnz8fgNmzZ3PcccdxyCGHjGoco+2UU06hr69vmdo8/PDDrLpq5ylHf6Lzlre8ZVnDexxndCRJkqRxtNNOO3HbbbcBcPXVV7PzzjuzzTbbsPPOO/PLX/4SaM3UvO51r+OlL30p06dP56Mf/eij7U888USe/exn84IXvIDLL7/80fJbb72VF73oRcycOZMXvehF/OY3vwFgv/324z3veQ+77747z3jGM7j44ot5xzvewXOe8xz222+/juP+85//zGte8xpmzpzJjjvuyPXXXw/AYYcdxv7778+LX/xi9t13X/74xz/y+te/nu23357tt9/+0RgvvvjiR2eIttlmG+655x4OPvhgLr30UmbNmsXnPve5Ed1XZ3QkSZKkcbJkyRIuuOAC3vnOdwKw+eabc8kll7Dqqqvyk5/8hH/+53/me9/7HgDz589n3rx5rL766my22WZ84AMfYNVVV+WTn/wkc+fOZcqUKey+++5ss802ALz//e9n3333Zfbs2Xzta1/jgx/8ID/4wQ8AuOuuu/jpT3/KmWeeyStf+Uouv/xyvvrVr7L99tszf/58Zs2a9bhY99lnH9ZYYw0ALrjgAg477DC22WYbfvCDH/DTn/6Ufffd99FZqrlz53LZZZexxhpr8Ja3vIV/+Id/YNddd+U3v/kNL3nJS/jFL37B0UcfzXHHHccuu+zC4sWLmTx5MkcccQRHH300Z5999ojvrYmOJEmSNMbuv/9+Zs2axcKFC9luu+34u7/7OwAWLVrE7Nmz+fWvf00SHnrooUfbvOhFL2LKlCkAbL
HFFtx6663ceeed7LbbbmywwQYA7L333vzqV78C4IorruCMM84A4G1ve9tjZoFe+cpXkoQZM2aw4YYbMmPGDAC23HJ
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 864x576 with 1 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {
|
|||
|
|
"needs_background": "light"
|
|||
|
|
},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Plot the feature importance for each feature / column\n",
|
|||
|
|
"plt.figure(figsize=(12, 8))\n",
|
|||
|
|
"for model_name, model_importance in feature_importance.items():\n",
|
|||
|
|
" plt.barh(list(model_importance.keys()), list(model_importance.values()), label=model_name)\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.xlabel('Feature Importance')\n",
|
|||
|
|
"plt.ylabel('Features / Columns')\n",
|
|||
|
|
"plt.title('Feature Importance Bar Chart')\n",
|
|||
|
|
"plt.legend()\n",
|
|||
|
|
"plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "6e08cfac",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": []
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"metadata": {
|
|||
|
|
"kernelspec": {
|
|||
|
|
"display_name": "Python 3 (ipykernel)",
|
|||
|
|
"language": "python",
|
|||
|
|
"name": "python3"
|
|||
|
|
},
|
|||
|
|
"language_info": {
|
|||
|
|
"codemirror_mode": {
|
|||
|
|
"name": "ipython",
|
|||
|
|
"version": 3
|
|||
|
|
},
|
|||
|
|
"file_extension": ".py",
|
|||
|
|
"mimetype": "text/x-python",
|
|||
|
|
"name": "python",
|
|||
|
|
"nbconvert_exporter": "python",
|
|||
|
|
"pygments_lexer": "ipython3",
|
|||
|
|
"version": "3.9.12"
|
|||
|
|
}
|
|||
|
|
},
|
|||
|
|
"nbformat": 4,
|
|||
|
|
"nbformat_minor": 5
|
|||
|
|
}
|