In [1]:
import pandas as pd
import numpy as np
In [4]:
data=pd.read_excel(r"C:\Users\satvi\Downloads\gtex_integrin_7_organs.xlsx")
In [5]:
data
Out[5]:
| Unnamed: 0 | primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | GTEX-13QIC-0011-R1a-SM-5O9CJ | Brain | 0.5763 | -6.5064 | 2.2573 | 0.7832 | 1.0363 | 4.6035 | 2.5731 | -2.8262 | ... | 2.8562 | 1.3846 | 5.8430 | 1.1316 | -0.7108 | 3.5387 | -0.0725 | -0.4521 | 0.2029 | -2.8262 |
| 1 | GTEX-1399S-1726-SM-5L3DI | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
| 2 | GTEX-PWCY-1326-SM-48TCU | Ovary | 2.3953 | -5.0116 | 1.4547 | 4.2593 | -0.7346 | 4.4149 | 0.2642 | 1.5216 | ... | 3.6816 | 1.5465 | 7.2964 | -0.9406 | 2.7742 | 5.0414 | 2.0325 | 0.7579 | 2.2573 | 1.2516 |
| 3 | GTEX-QXCU-0626-SM-2TC69 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
| 4 | GTEX-ZA64-1526-SM-5CVMD | Breast | 2.0569 | -2.4659 | 3.3993 | 3.1311 | 3.0074 | 4.4977 | -1.7809 | 2.7139 | ... | 4.7340 | 0.6332 | 7.3496 | -0.9406 | 2.5338 | 6.5696 | 1.7229 | -0.6416 | 3.1195 | 1.1050 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1982 | GTEX-QMRM-0826-SM-3NB33 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
| 1983 | GTEX-YFCO-1626-SM-4W1Z3 | Prostate | 2.9581 | -4.6082 | 1.1641 | 4.6938 | 1.5902 | 5.8625 | -0.5125 | 1.7617 | ... | 3.8798 | -1.4699 | 7.5163 | -0.3752 | 2.9562 | 5.3035 | 4.4304 | -0.9406 | 3.6136 | 0.4233 |
| 1984 | GTEX-1117F-2826-SM-5GZXL | Breast | 4.3184 | -6.5064 | 1.0433 | 4.8440 | 3.5498 | 4.6809 | 1.0293 | 3.3478 | ... | 5.3256 | -0.0725 | 7.7516 | 1.1382 | 2.1411 | 7.1132 | 0.3796 | 0.0854 | 3.8650 | 1.0151 |
| 1985 | GTEX-Q2AG-2826-SM-2HMJQ | Brain | 3.4622 | -5.5735 | 1.5013 | 5.4835 | 1.7702 | 4.7517 | 0.6790 | -3.1714 | ... | 1.1960 | 4.1740 | 4.3002 | 0.5470 | -0.9971 | 3.7982 | -0.2498 | 1.4808 | -0.5125 | -0.5125 |
| 1986 | GTEX-XV7Q-0426-SM-4BRVN | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
1987 rows × 29 columns
In [6]:
integrins=pd.read_excel(r"C:\Users\satvi\Downloads\gtex_integrin_7_organs.xlsx")
In [7]:
integrins
Out[7]:
| Unnamed: 0 | primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | GTEX-13QIC-0011-R1a-SM-5O9CJ | Brain | 0.5763 | -6.5064 | 2.2573 | 0.7832 | 1.0363 | 4.6035 | 2.5731 | -2.8262 | ... | 2.8562 | 1.3846 | 5.8430 | 1.1316 | -0.7108 | 3.5387 | -0.0725 | -0.4521 | 0.2029 | -2.8262 |
| 1 | GTEX-1399S-1726-SM-5L3DI | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
| 2 | GTEX-PWCY-1326-SM-48TCU | Ovary | 2.3953 | -5.0116 | 1.4547 | 4.2593 | -0.7346 | 4.4149 | 0.2642 | 1.5216 | ... | 3.6816 | 1.5465 | 7.2964 | -0.9406 | 2.7742 | 5.0414 | 2.0325 | 0.7579 | 2.2573 | 1.2516 |
| 3 | GTEX-QXCU-0626-SM-2TC69 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
| 4 | GTEX-ZA64-1526-SM-5CVMD | Breast | 2.0569 | -2.4659 | 3.3993 | 3.1311 | 3.0074 | 4.4977 | -1.7809 | 2.7139 | ... | 4.7340 | 0.6332 | 7.3496 | -0.9406 | 2.5338 | 6.5696 | 1.7229 | -0.6416 | 3.1195 | 1.1050 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1982 | GTEX-QMRM-0826-SM-3NB33 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
| 1983 | GTEX-YFCO-1626-SM-4W1Z3 | Prostate | 2.9581 | -4.6082 | 1.1641 | 4.6938 | 1.5902 | 5.8625 | -0.5125 | 1.7617 | ... | 3.8798 | -1.4699 | 7.5163 | -0.3752 | 2.9562 | 5.3035 | 4.4304 | -0.9406 | 3.6136 | 0.4233 |
| 1984 | GTEX-1117F-2826-SM-5GZXL | Breast | 4.3184 | -6.5064 | 1.0433 | 4.8440 | 3.5498 | 4.6809 | 1.0293 | 3.3478 | ... | 5.3256 | -0.0725 | 7.7516 | 1.1382 | 2.1411 | 7.1132 | 0.3796 | 0.0854 | 3.8650 | 1.0151 |
| 1985 | GTEX-Q2AG-2826-SM-2HMJQ | Brain | 3.4622 | -5.5735 | 1.5013 | 5.4835 | 1.7702 | 4.7517 | 0.6790 | -3.1714 | ... | 1.1960 | 4.1740 | 4.3002 | 0.5470 | -0.9971 | 3.7982 | -0.2498 | 1.4808 | -0.5125 | -0.5125 |
| 1986 | GTEX-XV7Q-0426-SM-4BRVN | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
1987 rows × 29 columns
In [8]:
data_liver_lung = integrins[integrins['primary_site'].isin(['Liver', 'Lung'])] #filter data by organ, display both brain and lung data
data_liver_lung
Out[8]:
| Unnamed: 0 | primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | GTEX-1399S-1726-SM-5L3DI | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
| 3 | GTEX-QXCU-0626-SM-2TC69 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
| 5 | GTEX-11EI6-0826-SM-5985V | Lung | 6.0732 | -2.4659 | 3.9901 | 7.3945 | 4.7688 | 5.1157 | 4.3356 | 2.3366 | ... | 3.7378 | 4.7247 | 7.5016 | 5.1396 | 2.5036 | 6.5443 | 4.6531 | 3.8136 | 5.8679 | 0.7407 |
| 6 | GTEX-S341-0326-SM-2XCAU | Lung | 4.2510 | -5.0116 | 3.3076 | 6.1715 | 3.1129 | 5.2954 | 2.2960 | 1.1184 | ... | 4.7104 | 2.7530 | 7.5022 | 4.0730 | 2.6325 | 6.0483 | 5.0562 | 2.6962 | 5.1611 | 0.9343 |
| 7 | GTEX-WY7C-0426-SM-3NB3C | Lung | 3.3633 | -2.5479 | 4.8340 | 6.6864 | 3.0585 | 4.8294 | 2.6464 | 0.7999 | ... | 5.1190 | 1.5013 | 8.0260 | 3.6635 | 3.2435 | 5.8503 | 5.2991 | 2.8076 | 4.7571 | -0.1345 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1969 | GTEX-13FTZ-0726-SM-5IFFY | Liver | -0.6873 | -3.4580 | -0.5125 | -0.3566 | -0.4921 | 3.0654 | -4.0350 | -1.5951 | ... | 0.9493 | -1.9942 | 5.2563 | 2.5924 | -0.3752 | 4.5053 | -4.6082 | -2.2447 | 3.1458 | -2.8262 |
| 1970 | GTEX-RWS6-0226-SM-2XCA9 | Lung | 6.0830 | -0.5756 | 4.3889 | 6.7302 | 4.6053 | 5.1065 | 2.8321 | 0.9716 | ... | 5.8176 | 2.5437 | 7.7929 | 4.9012 | 2.7993 | 6.7510 | 5.2204 | 2.8422 | 5.0951 | -0.3201 |
| 1975 | GTEX-131XE-0726-SM-5HL9K | Lung | 3.7971 | -1.9379 | 4.8555 | 6.4052 | 3.9561 | 5.4263 | 3.2959 | 4.5199 | ... | 4.6697 | 6.5777 | 7.5114 | 5.2130 | 2.3816 | 6.6225 | 3.7389 | 3.7248 | 5.6809 | 0.8488 |
| 1982 | GTEX-QMRM-0826-SM-3NB33 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
| 1986 | GTEX-XV7Q-0426-SM-4BRVN | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
398 rows × 29 columns
In [9]:
data_liver_lung_expression_only=data_liver_lung.iloc[:,1:]
In [10]:
data_liver_lung_expression_only
Out[10]:
| primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ITGA7 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | 3.9270 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
| 3 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | 4.5355 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
| 5 | Lung | 6.0732 | -2.4659 | 3.9901 | 7.3945 | 4.7688 | 5.1157 | 4.3356 | 2.3366 | 5.0527 | ... | 3.7378 | 4.7247 | 7.5016 | 5.1396 | 2.5036 | 6.5443 | 4.6531 | 3.8136 | 5.8679 | 0.7407 |
| 6 | Lung | 4.2510 | -5.0116 | 3.3076 | 6.1715 | 3.1129 | 5.2954 | 2.2960 | 1.1184 | 5.2392 | ... | 4.7104 | 2.7530 | 7.5022 | 4.0730 | 2.6325 | 6.0483 | 5.0562 | 2.6962 | 5.1611 | 0.9343 |
| 7 | Lung | 3.3633 | -2.5479 | 4.8340 | 6.6864 | 3.0585 | 4.8294 | 2.6464 | 0.7999 | 4.9246 | ... | 5.1190 | 1.5013 | 8.0260 | 3.6635 | 3.2435 | 5.8503 | 5.2991 | 2.8076 | 4.7571 | -0.1345 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1969 | Liver | -0.6873 | -3.4580 | -0.5125 | -0.3566 | -0.4921 | 3.0654 | -4.0350 | -1.5951 | 2.3337 | ... | 0.9493 | -1.9942 | 5.2563 | 2.5924 | -0.3752 | 4.5053 | -4.6082 | -2.2447 | 3.1458 | -2.8262 |
| 1970 | Lung | 6.0830 | -0.5756 | 4.3889 | 6.7302 | 4.6053 | 5.1065 | 2.8321 | 0.9716 | 3.2973 | ... | 5.8176 | 2.5437 | 7.7929 | 4.9012 | 2.7993 | 6.7510 | 5.2204 | 2.8422 | 5.0951 | -0.3201 |
| 1975 | Lung | 3.7971 | -1.9379 | 4.8555 | 6.4052 | 3.9561 | 5.4263 | 3.2959 | 4.5199 | 5.5589 | ... | 4.6697 | 6.5777 | 7.5114 | 5.2130 | 2.3816 | 6.6225 | 3.7389 | 3.7248 | 5.6809 | 0.8488 |
| 1982 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | 5.2032 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
| 1986 | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | 7.7121 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
398 rows × 28 columns
In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#define wwhat is X and what is Y in your model
X=data_liver_lung_expression_only[['ITGA10']]
y=data_liver_lung_expression_only['primary_site']
#define split between train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#define the model you want to use : logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)
# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[11], line 1 ----> 1 from sklearn.model_selection import train_test_split 2 from sklearn.linear_model import LogisticRegression 3 from sklearn.metrics import accuracy_score ModuleNotFoundError: No module named 'sklearn'
In [12]:
#swithc ITGA10 to ITGB4, and see how that impact its accurancy
#define wwhat is X and what is Y in your model
X=data_liver_lung_expression_only[['ITGB4']]
y=data_liver_lung_expression_only['primary_site']
#define split between train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
#define the model you want to use : logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)
# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGB4: {accuracy:.2f}")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[12], line 7 4 y=data_liver_lung_expression_only['primary_site'] 6 #define split between train and test ----> 7 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) 9 #define the model you want to use : logistic regression 10 model = LogisticRegression() NameError: name 'train_test_split' is not defined
In [13]:
!pip install scikit-learn
Collecting scikit-learn
Downloading scikit_learn-1.6.1-cp39-cp39-win_amd64.whl (11.2 MB)
--------------------------------------- 11.2/11.2 MB 38.5 MB/s eta 0:00:00
Collecting scipy>=1.6.0
Downloading scipy-1.13.1-cp39-cp39-win_amd64.whl (46.2 MB)
--------------------------------------- 46.2/46.2 MB 31.1 MB/s eta 0:00:00
Requirement already satisfied: numpy>=1.19.5 in c:\users\satvi\appdata\local\programs\python\python39\lib\site-packages (from scikit-learn) (2.0.2)
Collecting joblib>=1.2.0
Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
------------------------------------- 308.4/308.4 KB 18.6 MB/s eta 0:00:00
Collecting threadpoolctl>=3.1.0
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.5.2 scikit-learn-1.6.1 scipy-1.13.1 threadpoolctl-3.6.0
WARNING: You are using pip version 22.0.4; however, version 25.2 is available. You should consider upgrading via the 'C:\Users\satvi\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.
In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#define wwhat is X and what is Y in your model
X=data_liver_lung_expression_only[['ITGA10']]
y=data_liver_lung_expression_only['primary_site']
#define split between train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#define the model you want to use : logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)
# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")
Accuracy using ITGA10: 1.00
In [15]:
#swithc ITGA10 to ITGB4, and see how that impact its accurancy
#define wwhat is X and what is Y in your model
X=data_liver_lung_expression_only[['ITGB4']]
y=data_liver_lung_expression_only['primary_site']
#define split between train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
#define the model you want to use : logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)
# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGB4: {accuracy:.2f}")
Accuracy using ITGB4: 0.97
In [16]:
len(y_test)
Out[16]:
160
In [18]:
#AUROC curve
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
X = data_liver_lung_expression_only[['ITGB4']] # 👈 Use your chosen integrin
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1}) # Binary encoding
# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)
# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1] # Probabilities for class "Lung"
# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray') # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) using ITGB4 expression')
plt.legend()
plt.grid(True)
plt.show()
In [19]:
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGB4: {accuracy:.2f}")
Accuracy using ITGB4: 0.98
In [20]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt
# Step 1: Predict labels (not probabilities)
y_pred = model.predict(X_test)
# Step 2: Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
labels = ['Liver', 'Lung'] # same order as your encoded classes (0 and 1)
# Step 3: Plot with seaborn heatmap
plt.figure(figsize=(3, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Liver vs Lung) Using ITGB4')
plt.tight_layout()
plt.show()
In [21]:
#AUROC curve
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
X = data_liver_lung_expression_only[['ITGA10']] # 👈 Use your chosen integrin
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1}) # Binary encoding
# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)
# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1] # Probabilities for class "Lung"
# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray') # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) using ITGA10 expression')
plt.legend()
plt.grid(True)
plt.show()
In [22]:
#trying different machine learning models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
#Xtain, ytrain defined earlier
#Step 1: Prepare data
X = data_liver_lung_expression_only[['ITGA10']] # 👈 Use your chosen integrin
y = data_liver_lung_expression_only['primary_site']
# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 4: Train KNN model
knn = KNeighborsClassifier(n_neighbors=3) # try 3 or 5
knn.fit(X_train, y_train)
# Step 5: Predict
y_pred = knn.predict(X_test)
# Step 6: Evaluate
acc = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy: {acc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Liver', 'Lung']))
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=['Brain', 'Lung'], yticklabels=['Liver', 'Lung'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('KNN Confusion Matrix (Liver vs Lung)')
plt.show()
✅ Accuracy: 0.98
Classification Report:
precision recall f1-score support
Liver 0.93 1.00 0.96 25
Lung 1.00 0.98 0.99 95
accuracy 0.98 120
macro avg 0.96 0.99 0.98 120
weighted avg 0.98 0.98 0.98 120
In [25]:
# trying using two ITGs with KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
# 1. Choose 2 integrins for 2D plot
X = data_liver_lung_expression_only[['ITGA10', 'ITGB1']]
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1})
# 2. Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 3. Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
# 4. Train KNN
#knn = KNeighborsClassifier(n_neighbors=5)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
# 5. Create meshgrid for background prediction surface
h = 0.02 # grid step
x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# 6. Predict over the mesh
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# 7. Plot
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap='bwr')
# Plot training points
sns.scatterplot(x=X_scaled[:, 0], y=X_scaled[:, 1], hue=y,
palette={0: 'blue', 1: 'red'}, edgecolor='k', alpha=0.8)
plt.xlabel('ITGA10 (scaled)')
plt.ylabel('ITGB1 (scaled)')
plt.title('KNN Neighborhood and Decision Boundary')
plt.legend(title='True Label', labels=['Liver', 'Lung'])
plt.grid(True)
plt.tight_layout()
plt.show()
In [26]:
y_pred = knn.predict(X_test)
# Step 6: Evaluate
acc = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy: {acc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Liver', 'Lung']))
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=['Liver', 'Lung'], yticklabels=['Liver', 'Lung'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('KNN Confusion Matrix (Liver vs Lung)')
plt.show()
✅ Accuracy: 0.99
Classification Report:
precision recall f1-score support
Liver 0.96 1.00 0.98 25
Lung 1.00 0.99 0.99 95
accuracy 0.99 120
macro avg 0.98 0.99 0.99 120
weighted avg 0.99 0.99 0.99 120
In [27]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Predict class probabilities
y_proba = knn.predict_proba(X_test)[:, 1] # probability of class 1 (Lung)
# Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc_score = roc_auc_score(y_test, y_proba)
# Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'KNN (AUC = {auc_score:.2f})', color='darkorange')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) - KNN')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
In [28]:
#get the model accuray for this ITGA10
#AUROC curve
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
X = data_liver_lung_expression_only[['ITGA10']] # 👈 Use your chosen integrin
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1}) # Binary encoding
# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)
# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1] # Probabilities for class "Lung"
# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray') # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) using ITGA10 expression')
plt.legend()
plt.grid(True)
plt.show()
# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")
Accuracy using ITGA10: 1.00
In [29]:
#get confusion matrix and calculate your own accurancy, precision and sensitivity
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=['Liver', 'Lung'], yticklabels=['Liver', 'Lung'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix (Liver vs Lung)')
plt.show()
In [30]:
from sklearn.metrics import roc_curve, accuracy_score
# After model is trained and you have predicted probabilities
y_proba = model.predict_proba(X_test)[:, 1] # Lung class = 1
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
# Now find the threshold that gives highest accuracy
accuracies = []
for thresh in thresholds:
y_pred_thresh = (y_proba >= thresh).astype(int)
acc = accuracy_score(y_test, y_pred_thresh)
accuracies.append(acc)
# Find best threshold
best_idx = np.argmax(accuracies)
best_threshold = thresholds[best_idx]
best_accuracy = accuracies[best_idx]
print(f"🎯 Best threshold: {best_threshold:.3f}")
print(f"✅ Highest accuracy at this threshold: {best_accuracy:.3f}")
🎯 Best threshold: 0.612 ✅ Highest accuracy at this threshold: 1.000
In [31]:
a = model.coef_[0][0]
b = model.intercept_[0]
p_thresh = best_threshold # the best probability threshold
# Invert the sigmoid function
ITGA10_thresh = (np.log(p_thresh / (1 - p_thresh)) - b) / a
print(f"🧬 ITGA10 expression threshold ≈ {ITGA10_thresh:.3f}")
🧬 ITGA10 expression threshold ≈ 1.876
In [32]:
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")
Accuracy using ITGA10: 1.00
In [33]:
from sklearn.metrics import accuracy_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np
# Already computed:
y_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
# Compute accuracy at each threshold
accuracies = []
for thresh in thresholds:
y_pred_thresh = (y_proba >= thresh).astype(int)
acc = accuracy_score(y_test, y_pred_thresh)
accuracies.append(acc)
# Plot accuracy vs threshold
plt.figure(figsize=(8, 5))
plt.plot(thresholds, accuracies, marker='o')
plt.xlabel("Probability Threshold")
plt.ylabel("Accuracy")
plt.title("Accuracy vs Threshold (ITGA10)")
plt.grid(True)
# Mark the best threshold
best_idx = np.argmax(accuracies)
plt.axvline(thresholds[best_idx], color='red', linestyle='--', label=f"Best: {thresholds[best_idx]:.2f}")
plt.legend()
plt.tight_layout()
plt.show()
In [34]:
print("Model coefficient:", model.coef_)
print("Model intercept:", model.intercept_)
Model coefficient: [[1.6319934]] Model intercept: [-2.60809743]
In [35]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt
# Step 1: Predict labels (not probabilities)
y_pred = model.predict(X_test)
# Step 2: Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
labels = ['Liver', 'Lung'] # same order as your encoded classes (0 and 1)
# Step 3: Plot with seaborn heatmap
plt.figure(figsize=(3, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Liver vs Lung) Using ITGA10')
plt.tight_layout()
plt.show()
In [36]:
print("Model coefficient:", model.coef_)
print("Model intercept:", model.intercept_)
Model coefficient: [[1.6319934]] Model intercept: [-2.60809743]
In [37]:
decision_scores = model.decision_function(X_test)
print(decision_scores)
[ 5.18662945 6.05240195 7.85787625 3.05704126 2.46120047 5.19087264 5.80025897 -3.61879094 1.29660998 3.60898143 4.55357921 -1.17112724 6.58851178 3.34068172 2.75398009 1.85311973 -5.00696453 4.10935061 -4.33131926 3.9376649 2.68527317 -3.25110283 5.89377219 4.51147378 5.41102855 7.31931843 6.44652836 5.31115055 -3.31393457 -2.58231193 -3.72976649 5.35635677 -0.97496163 2.64006695 4.91832974 -2.33914492 2.78515116 3.71734579 5.73514243 4.62440773 4.64725563 3.30853145 2.44194295 2.82285021 5.59789179 3.83615491 3.78523672 4.0377061 4.57707992 2.54965452 -4.81912209 5.34036323 -3.88660106 3.98238152 4.96125116 -4.81912209 -1.18401999 8.60190204 2.62488941 6.56860146 -2.19650869 -2.33914492 3.70771703 5.24015884 -2.85354924 2.9991055 4.31710337 3.65728844 4.67614192 0.94409941 2.32329703 3.7157138 5.54746319 1.87776283 -1.52967619 -1.60523748 5.37789908 2.29702194 6.73979757 4.16973436 4.32950652 -2.67745715 -2.13792013 6.86072828 2.64773732 -5.95841668 3.93178973 -3.25110283 5.07744909 5.32502249 4.05712682 -4.58966382 7.0043437 1.9603417 5.56002954 0.45384859 5.39536141 4.60025422 4.88438427 5.90209536 5.97129188 8.15489905 1.52704745 7.30789447 0.52957308 7.90618326 6.87492663 4.37128555 7.90128728 5.58124546 4.19992624 6.95864789 4.73309849 2.26748286 5.69124181 5.42800128 -3.80695978 1.23296224 4.41698136 6.52519044]
In [39]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
X = data_liver_lung_expression_only[['ITGA3','ITGB4']] # 👈 Use your chosen integrin
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1}) # Binary encoding
# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)
# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1] # Probabilities for class "Lung"
# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray') # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) using ITGA3 and ITGB4expression')
plt.legend()
plt.grid(True)
plt.show()
In [40]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Step 1: Prepare features and target
selected_genes = ['ITGA10', 'ITGB4']
#X = integrins.iloc[:, -27:] # Assuming the last 27 columns are integrins
X = integrins[selected_genes] # using only selected gene; in this case just ITGA10 and ITGB4
y = integrins['primary_site']
# Step 2: Encode organ labels as numbers
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# Step 3: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
# Step 4: Train multinomial logistic regression
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)
# Step 5: Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Accuracy: 0.7939698492462312
Classification Report:
precision recall f1-score support
Bone Marrow 0.77 1.00 0.87 10
Brain 0.81 0.94 0.87 247
Breast 0.64 0.41 0.50 44
Liver 1.00 0.65 0.79 23
Lung 0.76 0.88 0.82 43
Ovary 0.50 0.10 0.17 10
Prostate 0.75 0.14 0.24 21
accuracy 0.79 398
macro avg 0.75 0.59 0.61 398
weighted avg 0.78 0.79 0.77 398
Confusion Matrix:
[[ 10 0 0 0 0 0 0]
[ 3 231 3 0 8 1 1]
[ 0 25 18 0 1 0 0]
[ 0 8 0 15 0 0 0]
[ 0 4 1 0 38 0 0]
[ 0 6 0 0 3 1 0]
[ 0 12 6 0 0 0 3]]
C:\Users\satvi\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning. warnings.warn(
In [41]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
In [42]:
integrins['primary_site'].value_counts()
Out[42]:
primary_site Brain 1152 Lung 288 Breast 179 Liver 110 Prostate 100 Ovary 88 Bone Marrow 70 Name: count, dtype: int64
In [43]:
integrins.shape
Out[43]:
(1987, 29)
In [44]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
Accuracy: 0.79
In [45]:
data_liver_lung = integrins[integrins['primary_site'].isin(['Liver', 'Lung'])] #filter data by organ, display both brain and lung data
data_liver_lung
Out[45]:
| Unnamed: 0 | primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | GTEX-1399S-1726-SM-5L3DI | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
| 3 | GTEX-QXCU-0626-SM-2TC69 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
| 5 | GTEX-11EI6-0826-SM-5985V | Lung | 6.0732 | -2.4659 | 3.9901 | 7.3945 | 4.7688 | 5.1157 | 4.3356 | 2.3366 | ... | 3.7378 | 4.7247 | 7.5016 | 5.1396 | 2.5036 | 6.5443 | 4.6531 | 3.8136 | 5.8679 | 0.7407 |
| 6 | GTEX-S341-0326-SM-2XCAU | Lung | 4.2510 | -5.0116 | 3.3076 | 6.1715 | 3.1129 | 5.2954 | 2.2960 | 1.1184 | ... | 4.7104 | 2.7530 | 7.5022 | 4.0730 | 2.6325 | 6.0483 | 5.0562 | 2.6962 | 5.1611 | 0.9343 |
| 7 | GTEX-WY7C-0426-SM-3NB3C | Lung | 3.3633 | -2.5479 | 4.8340 | 6.6864 | 3.0585 | 4.8294 | 2.6464 | 0.7999 | ... | 5.1190 | 1.5013 | 8.0260 | 3.6635 | 3.2435 | 5.8503 | 5.2991 | 2.8076 | 4.7571 | -0.1345 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1969 | GTEX-13FTZ-0726-SM-5IFFY | Liver | -0.6873 | -3.4580 | -0.5125 | -0.3566 | -0.4921 | 3.0654 | -4.0350 | -1.5951 | ... | 0.9493 | -1.9942 | 5.2563 | 2.5924 | -0.3752 | 4.5053 | -4.6082 | -2.2447 | 3.1458 | -2.8262 |
| 1970 | GTEX-RWS6-0226-SM-2XCA9 | Lung | 6.0830 | -0.5756 | 4.3889 | 6.7302 | 4.6053 | 5.1065 | 2.8321 | 0.9716 | ... | 5.8176 | 2.5437 | 7.7929 | 4.9012 | 2.7993 | 6.7510 | 5.2204 | 2.8422 | 5.0951 | -0.3201 |
| 1975 | GTEX-131XE-0726-SM-5HL9K | Lung | 3.7971 | -1.9379 | 4.8555 | 6.4052 | 3.9561 | 5.4263 | 3.2959 | 4.5199 | ... | 4.6697 | 6.5777 | 7.5114 | 5.2130 | 2.3816 | 6.6225 | 3.7389 | 3.7248 | 5.6809 | 0.8488 |
| 1982 | GTEX-QMRM-0826-SM-3NB33 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
| 1986 | GTEX-XV7Q-0426-SM-4BRVN | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
398 rows × 29 columns
In [46]:
data_liver_lung_expression_only=data_liver_lung.iloc[:,1:]
In [47]:
data_liver_lung_expression_only
Out[47]:
| primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ITGA7 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | 3.9270 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
| 3 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | 4.5355 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
| 5 | Lung | 6.0732 | -2.4659 | 3.9901 | 7.3945 | 4.7688 | 5.1157 | 4.3356 | 2.3366 | 5.0527 | ... | 3.7378 | 4.7247 | 7.5016 | 5.1396 | 2.5036 | 6.5443 | 4.6531 | 3.8136 | 5.8679 | 0.7407 |
| 6 | Lung | 4.2510 | -5.0116 | 3.3076 | 6.1715 | 3.1129 | 5.2954 | 2.2960 | 1.1184 | 5.2392 | ... | 4.7104 | 2.7530 | 7.5022 | 4.0730 | 2.6325 | 6.0483 | 5.0562 | 2.6962 | 5.1611 | 0.9343 |
| 7 | Lung | 3.3633 | -2.5479 | 4.8340 | 6.6864 | 3.0585 | 4.8294 | 2.6464 | 0.7999 | 4.9246 | ... | 5.1190 | 1.5013 | 8.0260 | 3.6635 | 3.2435 | 5.8503 | 5.2991 | 2.8076 | 4.7571 | -0.1345 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1969 | Liver | -0.6873 | -3.4580 | -0.5125 | -0.3566 | -0.4921 | 3.0654 | -4.0350 | -1.5951 | 2.3337 | ... | 0.9493 | -1.9942 | 5.2563 | 2.5924 | -0.3752 | 4.5053 | -4.6082 | -2.2447 | 3.1458 | -2.8262 |
| 1970 | Lung | 6.0830 | -0.5756 | 4.3889 | 6.7302 | 4.6053 | 5.1065 | 2.8321 | 0.9716 | 3.2973 | ... | 5.8176 | 2.5437 | 7.7929 | 4.9012 | 2.7993 | 6.7510 | 5.2204 | 2.8422 | 5.0951 | -0.3201 |
| 1975 | Lung | 3.7971 | -1.9379 | 4.8555 | 6.4052 | 3.9561 | 5.4263 | 3.2959 | 4.5199 | 5.5589 | ... | 4.6697 | 6.5777 | 7.5114 | 5.2130 | 2.3816 | 6.6225 | 3.7389 | 3.7248 | 5.6809 | 0.8488 |
| 1982 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | 5.2032 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
| 1986 | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | 7.7121 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
398 rows × 28 columns
In [48]:
data_liver_lung_expression_only['primary_site'].value_counts()
Out[48]:
primary_site Lung 288 Liver 110 Name: count, dtype: int64
In [49]:
liver_lung_vertical = data_liver_lung_expression_only.melt(id_vars = 'primary_site', var_name = 'integrin_gene', value_name = 'expression_levels')
In [50]:
plt.figure(figsize=(16, 6))
sns.violinplot(x = 'integrin_gene', y = 'expression_levels', hue = 'primary_site', data = liver_lung_vertical, split = True, inner = 'quartile')
plt.title("Integrin Genes of the Lung vs. the Liver")
plt.xlabel("Integrin Gene")
plt.ylabel("Gene Expression Levels")
plt.legend(title = 'primary_site')
plt.show()
In [51]:
data_liver_lung_expression_only
Out[51]:
| primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ITGA7 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | 3.9270 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
| 3 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | 4.5355 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
| 5 | Lung | 6.0732 | -2.4659 | 3.9901 | 7.3945 | 4.7688 | 5.1157 | 4.3356 | 2.3366 | 5.0527 | ... | 3.7378 | 4.7247 | 7.5016 | 5.1396 | 2.5036 | 6.5443 | 4.6531 | 3.8136 | 5.8679 | 0.7407 |
| 6 | Lung | 4.2510 | -5.0116 | 3.3076 | 6.1715 | 3.1129 | 5.2954 | 2.2960 | 1.1184 | 5.2392 | ... | 4.7104 | 2.7530 | 7.5022 | 4.0730 | 2.6325 | 6.0483 | 5.0562 | 2.6962 | 5.1611 | 0.9343 |
| 7 | Lung | 3.3633 | -2.5479 | 4.8340 | 6.6864 | 3.0585 | 4.8294 | 2.6464 | 0.7999 | 4.9246 | ... | 5.1190 | 1.5013 | 8.0260 | 3.6635 | 3.2435 | 5.8503 | 5.2991 | 2.8076 | 4.7571 | -0.1345 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1969 | Liver | -0.6873 | -3.4580 | -0.5125 | -0.3566 | -0.4921 | 3.0654 | -4.0350 | -1.5951 | 2.3337 | ... | 0.9493 | -1.9942 | 5.2563 | 2.5924 | -0.3752 | 4.5053 | -4.6082 | -2.2447 | 3.1458 | -2.8262 |
| 1970 | Lung | 6.0830 | -0.5756 | 4.3889 | 6.7302 | 4.6053 | 5.1065 | 2.8321 | 0.9716 | 3.2973 | ... | 5.8176 | 2.5437 | 7.7929 | 4.9012 | 2.7993 | 6.7510 | 5.2204 | 2.8422 | 5.0951 | -0.3201 |
| 1975 | Lung | 3.7971 | -1.9379 | 4.8555 | 6.4052 | 3.9561 | 5.4263 | 3.2959 | 4.5199 | 5.5589 | ... | 4.6697 | 6.5777 | 7.5114 | 5.2130 | 2.3816 | 6.6225 | 3.7389 | 3.7248 | 5.6809 | 0.8488 |
| 1982 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | 5.2032 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
| 1986 | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | 7.7121 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
398 rows × 28 columns
In [52]:
X = data_liver_lung_expression_only['ITGA10']
X
Out[52]:
1 4.9137
3 4.0541
5 6.0732
6 4.2510
7 3.3633
...
1969 -0.6873
1970 6.0830
1975 3.7971
1982 5.3067
1986 2.5585
Name: ITGA10, Length: 398, dtype: float64
In [53]:
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1})
y
Out[53]:
1 1
3 1
5 1
6 1
7 1
..
1969 0
1970 1
1975 1
1982 1
1986 1
Name: primary_site, Length: 398, dtype: int64
In [54]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
#X = data_liver_lung_expression_only[['ITGA3','ITGB4']] # 👈 Use your chosen integrin
X = data_liver_lung_expression_only[['ITGA10']]
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1}) # Binary encoding
# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)
# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1] # Probabilities for class "Lung"
# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray') # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) using ITGA10 expression')
plt.legend()
plt.grid(True)
plt.show()
In [57]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
Accuracy: 1.00
In [56]:
cm = confusion_matrix(y_test, y_pred)
labels = ['Liver', 'Lung'] # same order as your encoded classes (0 and 1)
# Step 3: Plot with seaborn heatmap
plt.figure(figsize=(3, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Liver vs Lung) Using ITGA10')
plt.tight_layout()
plt.show()
In [58]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
#X = data_liver_lung_expression_only[['ITGA3','ITGB4']] # 👈 Use your chosen integrin
X = data_liver_lung_expression_only[['ITGB7']]
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1}) # Binary encoding
# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)
# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1] # Probabilities for class "Lung"
# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray') # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) using ITGB7 expression')
plt.legend()
plt.grid(True)
plt.show()
In [59]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Step 2: Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
labels = ['Liver', 'Lung'] # same order as your encoded classes (0 and 1)
# Step 3: Plot with seaborn heatmap
plt.figure(figsize=(3, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Liver vs Lung) Using ITGA10')
plt.tight_layout()
plt.show()
Accuracy: 0.81
In [ ]: