Files
Data_ScienceUse_Cases/Classification/Mushrooms_Classification.ipynb

1314 lines
457 KiB
Plaintext
Raw Permalink Normal View History

2023-12-01 15:23:22 +07:00
{
"cells": [
{
"cell_type": "markdown",
"id": "a789c814",
"metadata": {},
"source": [
"# Load Basic Libraries\n",
" Load some libraries to read and display the data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "5d072c3f",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "947c545a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>class</th>\n",
" <th>cap-shape</th>\n",
" <th>cap-surface</th>\n",
" <th>cap-color</th>\n",
" <th>bruises</th>\n",
" <th>odor</th>\n",
" <th>gill-attachment</th>\n",
" <th>gill-spacing</th>\n",
" <th>gill-size</th>\n",
" <th>gill-color</th>\n",
" <th>...</th>\n",
" <th>stalk-surface-below-ring</th>\n",
" <th>stalk-color-above-ring</th>\n",
" <th>stalk-color-below-ring</th>\n",
" <th>veil-type</th>\n",
" <th>veil-color</th>\n",
" <th>ring-number</th>\n",
" <th>ring-type</th>\n",
" <th>spore-print-color</th>\n",
" <th>population</th>\n",
" <th>habitat</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>p</td>\n",
" <td>x</td>\n",
" <td>s</td>\n",
" <td>n</td>\n",
" <td>t</td>\n",
" <td>p</td>\n",
" <td>f</td>\n",
" <td>c</td>\n",
" <td>n</td>\n",
" <td>k</td>\n",
" <td>...</td>\n",
" <td>s</td>\n",
" <td>w</td>\n",
" <td>w</td>\n",
" <td>p</td>\n",
" <td>w</td>\n",
" <td>o</td>\n",
" <td>p</td>\n",
" <td>k</td>\n",
" <td>s</td>\n",
" <td>u</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>e</td>\n",
" <td>x</td>\n",
" <td>s</td>\n",
" <td>y</td>\n",
" <td>t</td>\n",
" <td>a</td>\n",
" <td>f</td>\n",
" <td>c</td>\n",
" <td>b</td>\n",
" <td>k</td>\n",
" <td>...</td>\n",
" <td>s</td>\n",
" <td>w</td>\n",
" <td>w</td>\n",
" <td>p</td>\n",
" <td>w</td>\n",
" <td>o</td>\n",
" <td>p</td>\n",
" <td>n</td>\n",
" <td>n</td>\n",
" <td>g</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>e</td>\n",
" <td>b</td>\n",
" <td>s</td>\n",
" <td>w</td>\n",
" <td>t</td>\n",
" <td>l</td>\n",
" <td>f</td>\n",
" <td>c</td>\n",
" <td>b</td>\n",
" <td>n</td>\n",
" <td>...</td>\n",
" <td>s</td>\n",
" <td>w</td>\n",
" <td>w</td>\n",
" <td>p</td>\n",
" <td>w</td>\n",
" <td>o</td>\n",
" <td>p</td>\n",
" <td>n</td>\n",
" <td>n</td>\n",
" <td>m</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>p</td>\n",
" <td>x</td>\n",
" <td>y</td>\n",
" <td>w</td>\n",
" <td>t</td>\n",
" <td>p</td>\n",
" <td>f</td>\n",
" <td>c</td>\n",
" <td>n</td>\n",
" <td>n</td>\n",
" <td>...</td>\n",
" <td>s</td>\n",
" <td>w</td>\n",
" <td>w</td>\n",
" <td>p</td>\n",
" <td>w</td>\n",
" <td>o</td>\n",
" <td>p</td>\n",
" <td>k</td>\n",
" <td>s</td>\n",
" <td>u</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>e</td>\n",
" <td>x</td>\n",
" <td>s</td>\n",
" <td>g</td>\n",
" <td>f</td>\n",
" <td>n</td>\n",
" <td>f</td>\n",
" <td>w</td>\n",
" <td>b</td>\n",
" <td>k</td>\n",
" <td>...</td>\n",
" <td>s</td>\n",
" <td>w</td>\n",
" <td>w</td>\n",
" <td>p</td>\n",
" <td>w</td>\n",
" <td>o</td>\n",
" <td>e</td>\n",
" <td>n</td>\n",
" <td>a</td>\n",
" <td>g</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" class cap-shape cap-surface cap-color bruises odor gill-attachment \\\n",
"0 p x s n t p f \n",
"1 e x s y t a f \n",
"2 e b s w t l f \n",
"3 p x y w t p f \n",
"4 e x s g f n f \n",
"\n",
" gill-spacing gill-size gill-color ... stalk-surface-below-ring \\\n",
"0 c n k ... s \n",
"1 c b k ... s \n",
"2 c b n ... s \n",
"3 c n n ... s \n",
"4 w b k ... s \n",
"\n",
" stalk-color-above-ring stalk-color-below-ring veil-type veil-color \\\n",
"0 w w p w \n",
"1 w w p w \n",
"2 w w p w \n",
"3 w w p w \n",
"4 w w p w \n",
"\n",
" ring-number ring-type spore-print-color population habitat \n",
"0 o p k s u \n",
"1 o p n n g \n",
"2 o p n n m \n",
"3 o p k s u \n",
"4 o e n a g \n",
"\n",
"[5 rows x 23 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the mushroom dataset straight from the project's GitHub repository\n",
"DATA_URL = \"https://raw.githubusercontent.com/youronlydimwit/Data_ScienceUse_Cases/main/Classification/Data/mushrooms.csv\"\n",
"df = pd.read_csv(DATA_URL)\n",
"\n",
"# Preview the first five rows\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "ed66f9da",
"metadata": {},
"source": [
"# Data Reformatting\n",
"Here we can see that every column has a textual (object) data type. We need to convert all of them to numeric values (in this case, integers)."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "64fa324d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Column_Name</th>\n",
" <th>Data_Type</th>\n",
" <th>Missing_Data</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>class</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>cap-shape</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>cap-surface</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>cap-color</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>bruises</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>odor</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>gill-attachment</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>gill-spacing</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>gill-size</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>gill-color</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>stalk-shape</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>stalk-root</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>stalk-surface-above-ring</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>stalk-surface-below-ring</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>stalk-color-above-ring</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>stalk-color-below-ring</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>veil-type</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>veil-color</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>ring-number</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>ring-type</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>spore-print-color</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>population</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>habitat</td>\n",
" <td>object</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Column_Name Data_Type Missing_Data\n",
"0 class object 0\n",
"1 cap-shape object 0\n",
"2 cap-surface object 0\n",
"3 cap-color object 0\n",
"4 bruises object 0\n",
"5 odor object 0\n",
"6 gill-attachment object 0\n",
"7 gill-spacing object 0\n",
"8 gill-size object 0\n",
"9 gill-color object 0\n",
"10 stalk-shape object 0\n",
"11 stalk-root object 0\n",
"12 stalk-surface-above-ring object 0\n",
"13 stalk-surface-below-ring object 0\n",
"14 stalk-color-above-ring object 0\n",
"15 stalk-color-below-ring object 0\n",
"16 veil-type object 0\n",
"17 veil-color object 0\n",
"18 ring-number object 0\n",
"19 ring-type object 0\n",
"20 spore-print-color object 0\n",
"21 population object 0\n",
"22 habitat object 0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def completeness_check(input_df):\n",
"    \"\"\"Summarise each column of `input_df`: its name, dtype and number of missing values.\n",
"\n",
"    Returns a new DataFrame with one row per column of `input_df` and the\n",
"    columns 'Column_Name', 'Data_Type' and 'Missing_Data'.\n",
"    \"\"\"\n",
"    return pd.DataFrame({\n",
"        'Column_Name': input_df.columns,\n",
"        'Data_Type': input_df.dtypes.values,\n",
"        'Missing_Data': input_df.isnull().sum().values,\n",
"    })\n",
"\n",
"# Report the dtype and missing-value count of every column\n",
"completeness_check(df)"
]
},
{
"cell_type": "markdown",
"id": "737ffd3b",
"metadata": {},
"source": [
"Here I use my own script to display either a histogram or a bar chart for the whole dataset."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c571ce47",
"metadata": {},
"outputs": [],
"source": [
"import wget"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "aa20d4af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
" 0% [ ] 0 / 1700\r",
"100% [................................................................................] 1700 / 1700"
]
},
{
"data": {
"text/plain": [
"'data_desc_graph.py'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NOTE(review): this fetches a helper script from the repository's `main` branch and the\n",
"# next cell imports it, i.e. unpinned remote code is executed locally - consider pinning a commit.\n",
"script_url = 'https://raw.githubusercontent.com/youronlydimwit/Data_ScienceUse_Cases/main/Scripts/data_desc_graph.py'\n",
"script_name = 'data_desc_graph.py'\n",
"wget.download(script_url, script_name)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f09fc246",
"metadata": {},
"outputs": [],
"source": [
"from data_desc_graph import data_desc_graph"
]
},
{
"cell_type": "markdown",
"id": "0126adc1",
"metadata": {},
"source": [
"Run the script on the dataset"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "14f49813",
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABDAAAAsKCAYAAABAPfo0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOzde7ylZV3//9dbQEAFARkQZoAhxRKoRpkIswN5CLISrLCxEuyrjiKmlpWMVqI5aqViaGJ4CPCEk1oQggYo+rM4OCKCgATKaWCEUUQGDyjj5/fHfe1msWftPXvP7L3X2nu/no/H/VhrXffpuu+11rXu9bmvQ6oKSZIkSZKkYfaQQWdAkiRJkiRpcwxgSJIkSZKkoWcAQ5IkSZIkDT0DGJIkSZIkaegZwJAkSZIkSUPPAIYkSZIkSRp6BjCkzUhSE5hubsuenmTNBLb5vLbe4unO/9ZIcnGSi7dgvV2SnJTkidOQrSnX8jorx5Qe7zOX5Gntc3b4qOVvnuQ+TkrylK3K6CyW5ElJLkvyvXY+l0zjvm5u+3h9n3lvmMWf02kr85IcPvpzPsZyJ40qt9cnuSHJh5McsZX7PynJtF9T9Xw+Rqb7klyV5E+TZLr335OPfZO8s52/H7Z8fDHJa5I8si2zuOXxBTOQnyXtPdhtCrf56iS3JnkgyZVTtd1x9ndgkn9NckuS+5N8N8n/l+RlSXaY5LZm7DO5tabz93ei5U77XRz5Tv2knftrk7wvyZO2Yv9HJ/nzLV1fUn9DX7BJQ+BJo6ZvAp8elfasSW7zk229tVOXzWnxkjZN1i7Aa4FZEcCYZ/6OyX9eXwvM2wAG8D5gW+B36L63/zsD+/yzJAtmYD8zZZjKvF+my8tRwJuBXYFPJfnAFv7hO5zuOzJT11S9vz+/C1wMnALMyB+lJL8KXAX8Rtvvb7Z8/CdwAnDSTORjlCV078GUBDCSHAqsBM4CfhV47lRsd5z9HQNcAfwsXRn9G8BzgP8BXge8aJKbPJyZ/UxujffSfZYHbR1dPn6J7vP8TuDxwP8kedMWbvNoZuh7Kc0n2w46A9Kwq6pLe18nuR/41uj0SW5zHd2P5VBKsn1V3V9V1w46L5szktdB52O2qKqvDzoPkzHo97f9of1pYGVVfWYKthdgu6r60TiLfQ44DDgReOXW7nOQkmwHPDBkZd5lVfVAz+v3Jfkz4G3AlcBbB5KriRv9+/Nfrbbbs5mCvPe8Z5vcFU+yK/Ax4DrgaVX1vVH5eCvdH8AZkWQbYDpqnjy+Pb67qr6xtRsbrxxLcgBwJnA+cMyoz+Z5Sd4CPG5r8zBseq4z1gCbrbk6A3406nt1UZJTgZOBE5OsrqqPDyhvknrMhsisNOskeUKr+vn9VsX2xaPmb1KtMckfJvlyq4r73SRXJxn3rkuSxyX59yR3tWq8tyb5tyTb9iyzIMm7ktzWqqXe1u40bt/mj1SrPjjJp5PcB6xq8x7UhCQbq2r/Xqty+Z0k9yb5UJJHtWUWAze1Vd7TUy3zeeMcx+lJ1iT5pVYN+Yfpqkr/6Rjn7Vfbcd4DXNbm7ZyuSvMd7TivT/Jn7Q9jv/fmh0luT/I3jLoAzsaqz88bld63qnqSZyX57/be3Zvk8iTP7Jm/bZIVSb7W8nZHkremp1pwW+bvkny95e1bSb6Q5JfHOm9bIqOakGxuv9lYtfc1Pe/lST3r/3GSr/Ss+4Eke43a58OSnJrk2+mq7f97e68fdI57PgdPSvI/SX4A/EObtyzJZ5Ksa+f5y0mO63N8la6pxSvTVcX+XpJPJtmjTavSfb9uS/KqzZyr5wEb6H4r/yY9zcUmcew3J/lgkv+X5GvAj4DfGm+/dBfzpwIvSbJwM3l80PvR0jb5/Pac26Uj57Z9R36rzf/zltd7k5ydUbU/JvgZHtnvS5L8Q5I7gPuBXTJGVe4kL0xyRcvPd5J8Lskv9cx/XZv/3XaOP5PksM2cv0mrqpOBLwOv6N
n3DklOTvLV9pn7ZpL/TPIzPcucRHenG+DHI9+Rmc4/cC+wXW9CkpcmuSTJ3UnuSXLpyPvds8yY79kY+3khsAD401HBCwCq6ntVdcGo5G2SvD7J2paP/0yyaFQ+JvP9XpnkxCQ30X2f/hT417bIDdlYTi1u67w8yXU9n7HVScashZbuN+/09vLrvd+xTOB3Jht/J343yXuSrAPuHGt/wJ/R3VB8yajgBdDd8Kiq/27bnorP5MOS/H2Sm5L8qD2+JqNqHyV5Yjb+Vt6WrknN6zKqucfWnpP0aUKSrrx5VbpmHD9sn4tPjRznRM7DVGhBvL9qeX1FT/4WJPmXJP+b7jrvtnRN0Rb2LHM6cBywsOczefNM5l+aq6yBIU29nYEPA28HXg/8CXBqkuur6rP9Vkj3Z/GDdNVx/5LuD9PPMPZF5IhzgXuA44FvAQuBZ7T1R+6W/Q9dtdo30FX73YOu6vRD6S5UR5xNV1X+74GfbGa/bwcupKviegDwRmBv4Nfpqoj/LvAJ4E3AOW2dzd353xn4aNv/jcAy4JQk66vq9FHLfgj4CPD7wLbtwuuTdE1W/ha4mu5P4tvoLrZf3c7H7sBn6JoBHdeO/y+BfTeTtzGlC7KcAvxH2+Z9LR+Lexb7IF3zg7+nez8eT1dNeDHwe22ZV9FdyL6G7i7wzsBSJlglOj1Bqx4TCVJvbr9PAi6hu6D/l5a2pu1zeUv7KLCC7jPwRuAXkzyxqu5ry58GHENXtXw18FS697CfR9JV234L3fv2g5b+U3R3ft9M9/n8VeC9SXasqneP2sZzga/SNX/ak+7zeiawE91dzpH8vDnJ1VV13hh5+SRdc4Mv0H033kv7zkzi2KH7Xiyhqwp+F3DzGPvr9UbgBcDfAC/ezLITtTPdeXgLcAfde/7xJP9Md4f3BDaer3+mu6M/YiKf4RGvAb4ILAe2AX7YLzPp7iy/ku7cvpbufT2M7vv4P22xhXR3QNcADwf+GPh8kqVVddWkz8D4zgdenWTfqroV2J7uM/MGunJtN7rP1KVJfqaqvkn3mVgEPJ/us7Jh1DanI//p+b7vRPe+PB3461HLLW75u5mNTaDOTfKMqjp/1LITes+ApwHfrKrVk8jvCrr38//R/f68le77/2s9y0zm+/084BvAXwDfows87UZ3/Mew8W7+2iR/1Pb3euD/A3YEfo7xy9WX0L1PK+h+y9YCayb6O9PjHXSfqecC4/Vh8TTgi1U1keZVW/WZbJ+bTwMH0n1/r6b7zv1N29Yr23K7AxfRlRPH0gWK/owH/64xjefkLLrmF2+nu9bYge4zsRfwtQmehylRVT9KchHw+0m2bUGm3ei+IyvoapbtTXfu/rvt/4d053cB8AvAyA2NkWuuGcu/NCdVlZOT0yQmuovBD44x73SggF/vSdueLrhwWk/a89pyi9vrvwDunmQ+dm/beOY4y7ye7uLlCeMsc1Lbzsv7zLsYuLjn9eFt2U+NWu6PWvpT2+vF7fULJngsI+dt2aj0C4BbgIw6byePWu63W/rzRqWP/OHcvb1eSXchtm/PMg9v70/1pC0eY3sjx394e70zsB74xDjH9ittnWPHOGdL2utzx9vOBM7deNPho5a/uef1ZvfbtvGGUWnb0N2V+uyo9F9uy7+svf5puj8kfzVquVNGn+OeYzlqM/l5CN0fsvcAX+mT1/8Ftu1Je1tL/+uetG3pggn/upl9bdvWPWmyx97Sbga+Dzx6gu/nzbTyhe4C+EfAY9rrN/R+TnuO96RRaZt8fnvO7a/2pP1cS7se2GbU+frxSNokPsMj+72C9p3tWfZ5PLjMeyxd2fS2SXzWt2nvx/XAP431vRxn/ZPactuOMf9Fbf4vjrP/h9F95/9sotvdXP4nM7XPR7/v+Gmjz/kY35n/As7u81nZ5D0bYzvXAZdMMK8j2/7cqPS/aOl7byavY32/7wB2HOPz9dhR6e8ErtiC8/yC3s9rS5vo78zI5/HfJ7ivHwAf2c
LPw6Q+k3SBgweVAy39NXRlzR7t9Rvb60U9y+xIV+7VVJ6Tkbz2vH4Ko8rRrTgPI5+LxZtZ/3RgzTjz39S2s+c4+9+
"text/plain": [
"<Figure size 1080x2880 with 24 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Dependencies Satisfied'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_desc_graph(df)"
]
},
{
"cell_type": "markdown",
"id": "efbc9d2d",
"metadata": {},
"source": [
"## Encoding the unique values from the dataset"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "54a1fd3b",
"metadata": {},
"outputs": [],
"source": [
"# Using LabelEncoder function from sklearn\n",
"from sklearn.preprocessing import LabelEncoder"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "27f56a20",
"metadata": {},
"outputs": [],
"source": [
"# Encode every column (features and target) as integer codes.\n",
"# Each fitted encoder is kept in `label_encoders` so the codes can be mapped back to the\n",
"# original letters later, e.g. label_encoders['class'].inverse_transform([0, 1]).\n",
"# NOTE(review): LabelEncoder gives nominal categories an arbitrary integer order; models that\n",
"# rely on distances or linear weights may be affected - consider one-hot encoding the features.\n",
"label_encoders = {}\n",
"for column in df.columns:\n",
"    le = LabelEncoder()\n",
"    df[column] = le.fit_transform(df[column])\n",
"    label_encoders[column] = le"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "caed560f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>class</th>\n",
" <th>cap-shape</th>\n",
" <th>cap-surface</th>\n",
" <th>cap-color</th>\n",
" <th>bruises</th>\n",
" <th>odor</th>\n",
" <th>gill-attachment</th>\n",
" <th>gill-spacing</th>\n",
" <th>gill-size</th>\n",
" <th>gill-color</th>\n",
" <th>...</th>\n",
" <th>stalk-surface-below-ring</th>\n",
" <th>stalk-color-above-ring</th>\n",
" <th>stalk-color-below-ring</th>\n",
" <th>veil-type</th>\n",
" <th>veil-color</th>\n",
" <th>ring-number</th>\n",
" <th>ring-type</th>\n",
" <th>spore-print-color</th>\n",
" <th>population</th>\n",
" <th>habitat</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" class cap-shape cap-surface cap-color bruises odor gill-attachment \\\n",
"0 1 5 2 4 1 6 1 \n",
"1 0 5 2 9 1 0 1 \n",
"2 0 0 2 8 1 3 1 \n",
"\n",
" gill-spacing gill-size gill-color ... stalk-surface-below-ring \\\n",
"0 0 1 4 ... 2 \n",
"1 0 0 4 ... 2 \n",
"2 0 0 5 ... 2 \n",
"\n",
" stalk-color-above-ring stalk-color-below-ring veil-type veil-color \\\n",
"0 7 7 0 2 \n",
"1 7 7 0 2 \n",
"2 7 7 0 2 \n",
"\n",
" ring-number ring-type spore-print-color population habitat \n",
"0 1 4 2 3 5 \n",
"1 1 4 3 2 1 \n",
"2 1 4 3 2 3 \n",
"\n",
"[3 rows x 23 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "839045a6",
"metadata": {},
"outputs": [],
"source": [
"# Run this code again to check if all the values have changed\n",
"# data_desc_graph(df)"
]
},
{
"cell_type": "markdown",
"id": "871a7563",
"metadata": {},
"source": [
"# Research Questions:\n",
"- What types of machine learning models perform best on this dataset?\n",
"- Which features are most indicative of a poisonous mushroom?"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "770f251e",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns"
]
},
{
"cell_type": "markdown",
"id": "322b03d4",
"metadata": {},
"source": [
"*Note that `veil-type` contains only 1 unique value, so its row and column in the correlation matrix below are empty (NaN)."
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "36d861f1",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjoAAAHlCAYAAAD8yFanAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOydd3gU1frHP2dLsum9kJAAoYTem4KAdBWliQ0EpKrYvVYU6QrSBaUpSFGpUpVeQu+hE2pIAum9J7t7fn/MpmyyIYF478/r3c/z5Ekyc945Zc6ceee0r5BSYsWKFStWrFix8k9E9f+dACtWrFixYsWKlX8XVkfHihUrVqxYsfKPxeroWLFixYoVK1b+sVgdHStWrFixYsXKPxaro2PFihUrVqxY+cdidXSsWLFixYoVK/9YrI6OFStW/nYIIYYKIQ5Xwv5PIcSQvzJN/2mEEIFCiAwhhPr/Oy1WrPw3Y3V0rFixYhEhxCtCiNOml220yXlo//+drpIIIcYLIVYVPyalfEpK+fO/Ia7lQggphHiuxPE5puNDK3idcCFE1weFkVJGSCkdpZSGSiTZipX/eayOjhUrVkohhPgAmANMBXyAQOB7oPcjXEtTkWP/RVwHCnuLTHkZANz6qyL4Ly8fK1b+VlgdHStWrJghhHABJgJjpJQbpZSZUsp8KeVWKeVHpjC2pl6M+6afOUIIW9O5TkKIKCHEJ0KIGGCZqddlvRBilRAiDRgqhHARQvxo6i26J4SYXNYwjRBirhAiUgiRJoQ4I4R4wnS8J/A58KKp5+m86fgBIcQI098qIcQXQoi7Qog4IcQKUx4RQlQ39cQMEUJECCEShBBjyymirUA7IYSb6f+ewAUgplh6awoh9gkhEk3XXC2EcDWdW4niOG41pfnjYukYLoSIAPYVO6YRQribyvRZ0zUchRA3hRCDH+LWWrHyP4nV0bFixUpJHgN0wO8PCDMWaAs0BZoArYEvip33BdyBasAo07HewHrAFVgN/AzogVpAM6A7MKKM+E6Z4nIHfgHWCSF0UsodKL1Oa0zDPE0s2A41/TwJBAGOwPwSYdoDwUAXYJwQot4D8p4DbAFeMv0/GFhRIowAvgb8gHpAADAeQEr5KhABPGtK8/Ridh1N4XsUv5iUMgkYBiwRQngDs4FQKWXJeK1YsVICq6NjxYqVkngACVJK/QPCDAQmSinjpJTxwATg1WLnjcBXUspcKWW26dgxKeUmKaURcAaeAt4z9RjFoby8X8ICUspVUspEKaVeSjkTsEVxTCrCQGCWlPK2lDID+Ax4qcTw0AQpZbaU8jxwHsV5exArgMGmnqGOwKYS6b0ppdxtyn88MMsUrjzGm8oju+QJKeUuYB2wF3gGGF2B61mx8j+PdRzYihUrJUkEPIUQmgc4O37A3WL/3zUdKyBeSplTwiay2N/VAC0QLYQoOKYqEaYQIcSHKL09foBEcZQ8y89KmWnVoMw9KiCm2N9ZKL0+ZSKlPCyE8ELpxdompcwulg9MvS7zgCcAJ5S8JVcgrRbzX4zFwFvAVCllYgWuZ8XK/zzWHh0rVqyU5BjK8EyfB4S5j+KsFBBoOlaAtGBT/FgkkAt4SildTT/OUsoGJY1M83E+AV4A3KSUrkAqyvBQWXGVl1Y9EFuOXXmsAj6k9LAVKMNWEmgspXQGBlGUXig7zWXmxTR/aZEpvjeEELUeJdFWrPyvYXV0rFixYoaUMhUYBywQQvQRQtgLIbRCiKeEEAXzSX4FvhBCeAkhPE3hV5V1TQtxRAO7gJlCCGfThOGaQghLwztOKI5JPKARQoxD6dEpIBaoLoQoqz37FXhfCFFDCOFI0ZyeBw3NVYR5QDcgpIw0ZwApQgh/4KMS52NR5gs9DJ+bfg8DZgArrHvsWLFSPlZHx4oVK6WQUs4CPkAZmolH6YF5i6K5KJOB0yirjS4CZ03HHobBgA1wBWVYZz1QxUK4ncCfKMu676L0NhUf4l
ln+p0ohDhrwf4nYCWKQ3LHZP/2Q6a1FFLKJCnlXimlpV6YCUBzlJ6n7cDGEue/RnEUU4QQ/yovLiFEC5T7Mdi0r840lN6fTyuTBytW/hcQlp9RK1asWLFixYqV/36sPTpWrFixYsWKlX8sVkfHihUrVqxYsfJvRwjxk2nTzktlnBdCiHmmzTAvCCGa/xXxWh0dK1asWLFixcp/guUoO4mXxVNAbdPPKOCHvyJSq6NjxYoVK1asWPm3I6UMAZIeEKQ3sEIqHAdchRCWFig8FFZHx4oVK1asWLHyd8Af8xWVUaZjlcK6M7KVB7JdG1ypZXm2Zy48sm0V+wc5/uVzOd6n/EBl8MPsU5WK+5VRbR7ZtmlASqXi/uG3vEe2/b7h6krF3W1Fy0e2HTTmiUrFHbL/XqXs7RxtH9lWZ6etVNw1gpzLD1QG6kp+rr73nCg/UBmcvV65zZk3H3/0fF8+U94m0g9m/Nv2j2zrlV+5upZmU9FNvUtTu2a1R79h5VDZ9r6X/vpoirTtABZLKRc/xCUs5a3SS8Otjo4VK1asWLFiBaGtnA8l8+ViFJmSRyUKRQC3gKqY77j+SFj30fkvRQgxHsiQUs74N0XxE9Ar/dJ1r5Bmz1oMUH/2WLx7dsSQncP54Z+Sdu4KAF7dnyBlYA9mLF1InhS07tSP7n2Hm9lKKVm3bBqXzx7CxlbHq2MmERhUH4BPRnQiOzMdgcTZxZWlK9ab2UZF3mX+nGncvnmDVwYPp0//Ih3Irb+vY/fObaSkJJGXZ8DV05/+I7/Gr3opZQGS46NY+8OHZGWm4FetPv1HTUOjseHq2b3UdMugQ/vW5OTkMvW7K9yN9Spl/8mbQTSp70RmlgGAbxbc5mZ4Jmn3V+FkE4ajg46xX31DjLEBienmtikJkWz76QNyslLxCajP00Omo9bYEHZ2B7tWf4pBr8fZxY2Pxs8hoFrNQrv7UeEsnjuJ8FthDHj1dZ7pOwiAvLxcJn/2OqNGvEaLFi3RGzWs2KEhIqb05r+vPedE9SoaEILYRD0/bkonN1/ioD/N9ZNzMOr1PP9cL4Z1bYvh2olCu1O37/Heyp34uzsB0Ll+DV7vUtSDo27ckWhbH3Jy9Uz7PpHrtzNLxf35O3Vo0tCFzEwlXVPnXefmnUzatXJDnbeJY0cOotPpeGbIN2hcS9+zcwdXcfbAz6QmRPDG18ewc3QH4PFgiZs2kazMVMZNmE2OY38c3OqY2Y7s50qNqjYIICZBz8L1yeTmSexsBVPf8cbNWdlk+M+jWWw5VEpTkyHPOFK9ikaxTzKwbGs6ufnw9gtONAiywSghMcXAr3tyuH3fYGb7Snc7An3UCCAu2cjKnVnk5SvPgSFiLjcuHcZWp6PTC1+jdimd70tHV3Hp8ArSEiMYPO4YOgc3pJTsXvkO4Vf2oFbb4OhahTrNn6VltzFmthePrOLCIcV26Phj2Dm4AXDv1gl2LB9DUPWqAHTr1o233nqrVNyWCAkJYcqUKWTn5vNkt2fpPWCw2fl7keEsmjuFO7eu8+Kro+nV75XCc28P74ednT1vjXmDFi1aoXXwZdMRPdEWOnD7t1fj5yEwGOFeomTrMQNGCf3aqagXKNBqBNsPpLJ8U9m9v8P6e/BkGyde/TgcAG3+OWKvLUKtlvR4+ll6PvuCWfioyLssmPONqX0ZQW9T+3IvKoLJ4z4mKSkBYXpvjh78Mi8891Sh7aETp/nxl3WohAq1WsXbw1+lcf26AJw4e55p8xeTkpqGnb09/foPYMAL5hq2+/fvZcO6tQDo7Ox4c8zbBAXVJD4+jlkzvyU5OYmoyMgrwOKwsLC5D7hFj8QO53qVcgh6pl0t11MSQlRH0YdraOHcMygbkz4NtAHmSSlbVyZNYJ2jY6VslvOA2fFePTvgUKs6B+p15+IbX9Jw/njlhEpF3dljmTD+KwadimHj5m2cPfon0ZG3zOwvnztMfPRdxn+3jVdGj+O3JcqmukaDgaz0VD6cvI
LV6//EydmFyIhwM1tHJ2eGj36H3v1eNDuemBDP9q0beGXwCGrXqUetRk9Qr3kXtq6YaDEPO9fO5LHug3l/2k7s7F0
"text/plain": [
"<Figure size 576x432 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Pairwise correlation between all (label-encoded) columns\n",
"df_corr = df.corr()\n",
"\n",
"# Draw the matrix as an annotated heatmap on an explicit figure/axes pair\n",
"fig, ax = plt.subplots(figsize=(8, 6))\n",
"sns.heatmap(df_corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)\n",
"ax.set_title('Correlation Matrix')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "756d9107",
"metadata": {},
"source": [
"## What types of machine learning models perform best on this dataset?"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "32f237e3",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "9846d182",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.svm import SVC\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.ensemble import GradientBoostingClassifier"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "90e82284",
"metadata": {},
"outputs": [],
"source": [
"# Target is the 'class' column; every other column is a feature\n",
"y = df['class']\n",
"X = df.drop('class', axis=1)\n",
"\n",
"# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# One list per reported column; one entry per evaluated model is appended later\n",
"metric_names = ['Model', 'F1_score', 'Accuracy', 'Precision', 'Recall']\n",
"results = {name: [] for name in metric_names}"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "9a317b55",
"metadata": {},
"outputs": [],
"source": [
"# Candidate classifiers. Estimators with internal randomness get a fixed seed so that\n",
"# scores and feature importances are reproducible across kernel restarts.\n",
"models = {\n",
"    'Random Forest': RandomForestClassifier(random_state=42),\n",
"    'Support Vector Machine': SVC(),\n",
"    'K-Nearest Neighbors': KNeighborsClassifier(),\n",
"    # The default iteration cap stopped lbfgs before convergence (ConvergenceWarning), so raise it\n",
"    'Logistic Regression': LogisticRegression(max_iter=1000),\n",
"    'Decision Tree': DecisionTreeClassifier(random_state=42),\n",
"    'Naive Bayes': GaussianNB(),\n",
"    'AdaBoost': AdaBoostClassifier(random_state=42),\n",
"    'Gradient Boosting': GradientBoostingClassifier(random_state=42)\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "b52ce7b0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n"
]
}
],
"source": [
"# Start from empty lists so that re-running this cell does not append duplicate rows\n",
"results = {'Model': [], 'F1_score': [], 'Accuracy': [], 'Precision': [], 'Recall': []}\n",
"\n",
"for model_name, model in models.items():\n",
"    # Train the model\n",
"    model.fit(X_train, y_train)\n",
"\n",
"    # Make predictions on the held-out rows\n",
"    y_pred = model.predict(X_test)\n",
"\n",
"    # Evaluate the model. F1, precision and recall are computed for the positive label 1\n",
"    # (scikit-learn's default), which is the code LabelEncoder gave to class 'p'.\n",
"    f1 = f1_score(y_test, y_pred)\n",
"    accuracy = accuracy_score(y_test, y_pred)\n",
"    precision = precision_score(y_test, y_pred)\n",
"    recall = recall_score(y_test, y_pred)\n",
"\n",
"    # Store results in the dictionary\n",
"    results['Model'].append(model_name)\n",
"    results['F1_score'].append(f1)\n",
"    results['Accuracy'].append(accuracy)\n",
"    results['Precision'].append(precision)\n",
"    results['Recall'].append(recall)\n",
"\n",
"# Create a DataFrame from the results dictionary (one row per model)\n",
"results_df = pd.DataFrame(results)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "85dd14ef",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Model</th>\n",
" <th>F1_score</th>\n",
" <th>Accuracy</th>\n",
" <th>Precision</th>\n",
" <th>Recall</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Random Forest</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Support Vector Machine</td>\n",
" <td>0.992278</td>\n",
" <td>0.992615</td>\n",
" <td>0.998705</td>\n",
" <td>0.985934</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>K-Nearest Neighbors</td>\n",
" <td>0.996178</td>\n",
" <td>0.996308</td>\n",
" <td>0.992386</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Logistic Regression</td>\n",
" <td>0.945153</td>\n",
" <td>0.947077</td>\n",
" <td>0.942748</td>\n",
" <td>0.947570</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Decision Tree</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Naive Bayes</td>\n",
" <td>0.919671</td>\n",
" <td>0.921846</td>\n",
" <td>0.909887</td>\n",
" <td>0.929668</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>AdaBoost</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Gradient Boosting</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Model F1_score Accuracy Precision Recall\n",
"0 Random Forest 1.000000 1.000000 1.000000 1.000000\n",
"1 Support Vector Machine 0.992278 0.992615 0.998705 0.985934\n",
"2 K-Nearest Neighbors 0.996178 0.996308 0.992386 1.000000\n",
"3 Logistic Regression 0.945153 0.947077 0.942748 0.947570\n",
"4 Decision Tree 1.000000 1.000000 1.000000 1.000000\n",
"5 Naive Bayes 0.919671 0.921846 0.909887 0.929668\n",
"6 AdaBoost 1.000000 1.000000 1.000000 1.000000\n",
"7 Gradient Boosting 1.000000 1.000000 1.000000 1.000000"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results_df"
]
},
{
"cell_type": "markdown",
"id": "55d00c78",
"metadata": {},
"source": [
"- **F1 Score** balances precision and recall, providing a single metric that considers both false positives and false negatives.\n",
"- **Accuracy** provides an overall measure of correct predictions but may not be suitable for imbalanced datasets.\n",
"- **Precision** focuses on minimizing false positives, useful when the cost of false positives is high.\n",
"- **Recall** focuses on minimizing false negatives, useful when the cost of false negatives is high."
]
},
{
"cell_type": "markdown",
"id": "3b44321e",
"metadata": {},
"source": [
"## Which features are most indicative of a poisonous mushroom?"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "c8620ac6",
"metadata": {},
"outputs": [],
"source": [
"# Create an empty dictionary to store feature importance scores\n",
"feature_importance = {}"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "e9e81c6c",
"metadata": {},
"outputs": [],
"source": [
"# Only the Random Forest's importances are collected here; extend this for other models\n",
"# that expose `feature_importances_` if needed.\n",
"# Look the fitted model up directly instead of scanning every entry of `models`.\n",
"importances = models['Random Forest'].feature_importances_\n",
"\n",
"# Map feature names to their importance scores\n",
"feature_importance['Random Forest'] = dict(zip(X.columns, importances))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "68387008",
"metadata": {},
"outputs": [],
"source": [
"# Convert the dictionary to a DataFrame for better visualization\n",
"feature_importance_df = pd.DataFrame(feature_importance).T"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "4653e9e9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>bruises</th>\n",
" <th>cap-color</th>\n",
" <th>cap-shape</th>\n",
" <th>cap-surface</th>\n",
" <th>gill-attachment</th>\n",
" <th>gill-color</th>\n",
" <th>gill-size</th>\n",
" <th>gill-spacing</th>\n",
" <th>habitat</th>\n",
" <th>odor</th>\n",
" <th>...</th>\n",
" <th>ring-type</th>\n",
" <th>spore-print-color</th>\n",
" <th>stalk-color-above-ring</th>\n",
" <th>stalk-color-below-ring</th>\n",
" <th>stalk-root</th>\n",
" <th>stalk-shape</th>\n",
" <th>stalk-surface-above-ring</th>\n",
" <th>stalk-surface-below-ring</th>\n",
" <th>veil-color</th>\n",
" <th>veil-type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Random Forest</th>\n",
" <td>0.05007</td>\n",
" <td>0.013889</td>\n",
" <td>0.004782</td>\n",
" <td>0.008458</td>\n",
" <td>0.002411</td>\n",
" <td>0.113684</td>\n",
" <td>0.146035</td>\n",
" <td>0.047978</td>\n",
" <td>0.033642</td>\n",
" <td>0.134474</td>\n",
" <td>...</td>\n",
" <td>0.066536</td>\n",
" <td>0.102494</td>\n",
" <td>0.011799</td>\n",
" <td>0.018804</td>\n",
" <td>0.064496</td>\n",
" <td>0.021023</td>\n",
" <td>0.043097</td>\n",
" <td>0.046428</td>\n",
" <td>0.002559</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" bruises cap-color cap-shape cap-surface gill-attachment \\\n",
"Random Forest 0.05007 0.013889 0.004782 0.008458 0.002411 \n",
"\n",
" gill-color gill-size gill-spacing habitat odor ... \\\n",
"Random Forest 0.113684 0.146035 0.047978 0.033642 0.134474 ... \n",
"\n",
" ring-type spore-print-color stalk-color-above-ring \\\n",
"Random Forest 0.066536 0.102494 0.011799 \n",
"\n",
" stalk-color-below-ring stalk-root stalk-shape \\\n",
"Random Forest 0.018804 0.064496 0.021023 \n",
"\n",
" stalk-surface-above-ring stalk-surface-below-ring veil-color \\\n",
"Random Forest 0.043097 0.046428 0.002559 \n",
"\n",
" veil-type \n",
"Random Forest 0.0 \n",
"\n",
"[1 rows x 22 columns]"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the table: rows are models, columns are features\n",
"feature_importance_df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "ce94b8d5",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAzoAAAHwCAYAAABuXRpPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAABiGUlEQVR4nO3deZRdVZn38e+PgIQxqCBvADWoEQQSAhTIpILSTjiLoqIEJxpn2kYbG0WwtYWGbmkFRbQFVFQE0WZQAZFZpoQEAoraDUEFWkUxECYhPO8f9xReiqrKTapuVeXm+1nrrjp3nz0851StVD3Z++ybqkKSJEmSeskq4x2AJEmSJI02Ex1JkiRJPcdER5IkSVLPMdGRJEmS1HNMdCRJkiT1HBMdSZIkST3HREeSJE0oSS5K8q7xjkPSis1ER5LUkSQLk9yfZHHba6NR6HOP0Yqxg/EOS/LNsRpvOEn2S3LZeMcxUJNkPNB8fxcluSTJjFEe4wnN9+LXSe5tfg6+lmTaaI7TNt5JST7djb4lTVwmOpKkZfHKqlq77XX7eAaTZNXxHH95rQBxv7+q1gaeDFwEfGN5OhnmOk8HXgW8BZgCbA3MBV60POMsJYZJo92npBWDiY4kaUSSTEnyX0nuSHJbkk/3/3GZ5JlJfprkT0nuTHJKkvWac98Angac1cwefDTJbkl+N6D/R2d9mlmA05N8M8ndwH7Djd9B7JXkvc3Mwj1J/qWJ+Yokdyf5bpInNHV3S/K7JP/cXMvCJPsMuA9fT/LHJLcm+XiSVZpz+yW5PMnnkvwZOBU4Htipufa/NPX2TDKvGfu3SQ5r639aE+/sJL9pYjik7fykJrb/ba5lbpKnNuc2T3J+kj8n+WWSN3Zyf6rqYeA7wBZt4+zQ3J+/NPf82P571HZP35fk18CvB7nnewB/B7y6qq6pqoeralFVHVdV/9VW9enNPbsnyXlJ1m/r47Qk/9c247Rl27mTknwpyQ+T3Au8E9gH+Ghzr8/q5NolrfhMdCRJI3Uy8DDwLGAb4MVA//MVAT4LbAQ8B3gqcBhAVb0N+A1/myX6tw7HezWtGYH1gFOWMn4nXgpsB+wIfBQ4gdYfxk8FtgLe3Fb3/wHrAxsDs4ETkmzWnPsCrdmJZwAvAPYF3t7W9rnAzcBTgLcCBwBXNNe+XlPn3qbdesCewHuSvGZAvLsCm9Ga/Tg0yXOa8g83sb4cWBd4B3BfkrWA84FvNWO/Gfhie3IwlCaB2Qe4sq14CfAPzX3YqYnjvQOavqa53i14vD2Aq6vqt0sZ/i207t9TgCcAB7Wd+xEwvTl3La2fg4FtPwOsA3y9Of9vzb1+5VLGldQjTHQkScviB83/5P8lyQ+SbAi8DDiwqu6tqj8AnwPeBFBV/1NV51fVg1X1R+A/aCUBI3FFVf2gqh6h9Qf9kON36MiquruqbgRuAM6rqpurahGtP6i3GVD/E831XAycA7yxmUHaG/hYVd1TVQuBfwfe1tbu9qr6QjODcf9ggVTVRVW1oKoeqarrgW/z+Pt1eFXdX1XXAdfRWvYFreTu41X1y2q5rqr+BLwCWFhVJzZjXwt8D9hrmHvy+WaWaTHwfuDwthjnVtWVTV8LgS8PEuNnq+rPQ1znk4E7hhm734lV9aumj+8Cs9pi+Fpznx+klThvnWRKW9v/rqrLm/v4QAdjSepBE32NsCRpYnlNVf2k/02SHYDVgDuS9BevAvy2Of8U4PPA82j97/oqwF0jjKF9JuDpw43fod+3Hd8/yPv/1/b+rqq6t+39rbRmq9anNetw64BzGw8R96CSPBc4gtZM0hOA1YHTBlT7v7bj+4C1m+OnAv87SLdPB57bvzyusSrDP3fzwar6arP0bhfgzCQvqKrrkzybVsLaB6zZ9DV3QPvhrvVPwLOHOd9v0OtsksrPAG8ANgAeaeqsDyzqYHxJKwlndCRJI/Fb4EFg/apar3mtW1X9y6I+CxQws6rWpb
VkK23ta0B/99L64xl49I/aDQbUaW+ztPFH2xObpWD9ngbcDtwJPEQrqWg/d9sQcQ/2HlrLy84EnlpVU2g9x5NB6g3mt8Azhyi/uO3+rNcs4XrP0jpsZkQuBf6H1pJAgC8BNwHTm+/pPw8S42DX1u8nwA5JNlna+EN4C63li3vQWio4rSkf7udquHgk9SgTHUnScquqO4DzgH9Psm6SVZqH+fuXMq1Da/nTX5JsDHxkQBe/p/VMS79fAZObh/JXAz5Oa1ZjecfvhsPT2h75ebSWhZ1WVUtoLa/6TJJ1kjyd1jMzw21l/Xtgk/YH+Wndrz9X1QPNbNlbliGurwL/kmR6WmYmeTJwNvDsJG9Lslrz2r7t2Z5hJdmJ1rM2N7bFeDewOMnmwFITpnbNjOD5wPeTbJdk1eaeHZDkHR10sQ6t5PZPtJLif+2gzcCfM0krARMdSdJI7UtrmdXPaS1LOx2Y2pw7HNiW1pKic4AzBrT9LPDx5pmfg5rnYt5L64/222jN8PyO4Q03/mj7v2aM22k94H5AVd3UnPsArXhvBi6jNTvztWH6+imt5OH/ktzZlL0X+FSSe4BDaSVPnfqPpv55tBKR/wLWqKp7aM3GvKmJ+/+AIxkmgQSObXYoW0xridvHq+pHzbmDaCVg9wBfobWD3LLaC/hh03YRrWej+mjN9izN12ktC7yN1vf8yuGrA617sUX/s2XLEa+kFVCqnM2VJGlpkuwGfLOqlnfJlSRpDDmjI0mSJKnnmOhIkiRJ6jkuXZMkSZLUc5zRkSRJktRzTHQkSZIk9ZxVxzsA9ab111+/pk2bNt5hSJIkqcfNnTv3zqoa+OHSJjrqjmnTpjFnzpzxDkOSJEk9Lsmtg5W7dE2SJElSzzHRkSRJktRzTHQkSZIk9Ryf0ZEkSdJK5aGHHuJ3v/sdDzzwwHiHomUwefJkNtlkE1ZbbbWO6pvoSJIkaaXyu9/9jnXWWYdp06aRZLzDUQeqij/96U/87ne/Y9NNN+2ojUvXJEmStFJ54IEHePKTn2ySswJJwpOf/ORlmoUz0ZEkSdJKxyRnxbOs3zMTHUmSJGmMTZo0iVmzZrHVVlvxyle+kr/85S+j0u9JJ53E+9///lHpq91uu+3GZpttxqxZs5g1axann376qI8BsHDhQr71rW+NSl8+oyNJkqSV2rSDzxnV/hYesedS66yxxhrMnz8fgNmzZ3PcccdxyCGHjGoco+2UU06hr69vmdo8/PDDrLpq5ylHf6Lzlre8ZVnDexxndCRJkqRxtNNOO3HbbbcBcPXVV7PzzjuzzTbbsPPOO/PLX/4SaM3UvO51r+OlL30p06dP56Mf/eij7U888USe/exn84IXvIDLL7/80fJbb72VF73oRcycOZMXvehF/OY3vwFgv/324z3veQ+77747z3jGM7j44ot5xzvewXOe8xz222+/juP+85//zGte8xpmzpzJjjvuyPXXXw/AYYcdxv7778+LX/xi9t13X/74xz/y+te/nu23357tt9/+0RgvvvjiR2eIttlmG+655x4OPvhgLr30UmbNmsXnPve5Ed1XZ3QkSZKkcbJkyRIuuOAC3vnOdwKw+eabc8kll7Dqqqvyk5/8hH/+53/me9/7HgDz589n3rx5rL766my22WZ84AMfYNVVV+WTn/wkc+fOZcqUKey+++5ss802ALz//e9n3333Zfbs2Xzta1/jgx/8ID/4wQ8AuOuuu/jpT3/KmWeeyStf+Uouv/xyvvrVr7L99tszf/58Zs2a9bhY99lnH9ZYYw0ALrjgAg477DC22WYbfvCDH/DTn/6Ufffd99FZqrlz53LZZZexxhpr8Ja3vIV/+Id/YNddd+U3v/kNL3nJS/jFL37B0UcfzXHHHccuu+zC4sWLmTx5MkcccQRHH300Z5999ojvrYmOJEmSNMbuv/9+Zs2axcKFC9luu+34u7/7OwAWLVrE7Nmz+fWvf00SHnrooUfbvOhFL2LKlCkAbL
HFFtx6663ceeed7LbbbmywwQYA7L333vzqV78C4IorruCMM84A4G1ve9tjZoFe+cpXkoQZM2aw4YYbMmPGDAC23HJ
"text/plain": [
"<Figure size 864x576 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Horizontal bar chart of importance per feature; each model in\n",
"# feature_importance contributes one barh series and one legend entry\n",
"fig, ax = plt.subplots(figsize=(12, 8))\n",
"for name, scores in feature_importance.items():\n",
"    ax.barh(list(scores), list(scores.values()), label=name)\n",
"\n",
"ax.set_xlabel('Feature Importance')\n",
"ax.set_ylabel('Features / Columns')\n",
"ax.set_title('Feature Importance Bar Chart')\n",
"ax.legend()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e08cfac",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}