In [1]:
import pandas as pd                                      
import numpy as np  
In [4]:
data=pd.read_excel(r"C:\Users\satvi\Downloads\gtex_integrin_7_organs.xlsx")
In [5]:
data
Out[5]:
Unnamed: 0 primary_site ITGA10 ITGAD ITGAM ITGA3 ITGBL1 ITGAE ITGA2 ITGB3 ... ITGA6 ITGA2B ITGB1 ITGAL ITGA9 ITGB5 ITGA8 ITGA4 ITGA1 ITGA11
0 GTEX-13QIC-0011-R1a-SM-5O9CJ Brain 0.5763 -6.5064 2.2573 0.7832 1.0363 4.6035 2.5731 -2.8262 ... 2.8562 1.3846 5.8430 1.1316 -0.7108 3.5387 -0.0725 -0.4521 0.2029 -2.8262
1 GTEX-1399S-1726-SM-5L3DI Lung 4.9137 -3.6259 4.7307 7.1584 1.7702 4.9556 1.9149 2.6067 ... 4.2412 4.1211 7.7256 4.4900 2.9281 6.1483 5.1867 2.6185 4.7856 -0.0277
2 GTEX-PWCY-1326-SM-48TCU Ovary 2.3953 -5.0116 1.4547 4.2593 -0.7346 4.4149 0.2642 1.5216 ... 3.6816 1.5465 7.2964 -0.9406 2.7742 5.0414 2.0325 0.7579 2.2573 1.2516
3 GTEX-QXCU-0626-SM-2TC69 Lung 4.0541 -2.3147 4.5053 7.5651 4.1788 4.1772 5.3695 1.8444 ... 4.9631 1.9149 7.9947 3.3911 2.8462 6.7683 4.1636 2.7951 5.3284 1.2147
4 GTEX-ZA64-1526-SM-5CVMD Breast 2.0569 -2.4659 3.3993 3.1311 3.0074 4.4977 -1.7809 2.7139 ... 4.7340 0.6332 7.3496 -0.9406 2.5338 6.5696 1.7229 -0.6416 3.1195 1.1050
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1982 GTEX-QMRM-0826-SM-3NB33 Lung 5.3067 -3.8160 4.9065 7.5810 5.8714 4.7345 2.6185 3.1095 ... 5.6080 3.7324 8.2849 4.6201 3.6440 6.7052 5.1094 3.3364 5.8153 1.6604
1983 GTEX-YFCO-1626-SM-4W1Z3 Prostate 2.9581 -4.6082 1.1641 4.6938 1.5902 5.8625 -0.5125 1.7617 ... 3.8798 -1.4699 7.5163 -0.3752 2.9562 5.3035 4.4304 -0.9406 3.6136 0.4233
1984 GTEX-1117F-2826-SM-5GZXL Breast 4.3184 -6.5064 1.0433 4.8440 3.5498 4.6809 1.0293 3.3478 ... 5.3256 -0.0725 7.7516 1.1382 2.1411 7.1132 0.3796 0.0854 3.8650 1.0151
1985 GTEX-Q2AG-2826-SM-2HMJQ Brain 3.4622 -5.5735 1.5013 5.4835 1.7702 4.7517 0.6790 -3.1714 ... 1.1960 4.1740 4.3002 0.5470 -0.9971 3.7982 -0.2498 1.4808 -0.5125 -0.5125
1986 GTEX-XV7Q-0426-SM-4BRVN Lung 2.5585 -1.7809 6.7916 6.5865 2.7051 4.9519 4.3618 3.1892 ... 3.5779 2.8974 7.7685 4.8294 1.9149 5.9989 2.4117 2.4198 4.2080 1.0007

1987 rows × 29 columns

In [6]:
integrins=pd.read_excel(r"C:\Users\satvi\Downloads\gtex_integrin_7_organs.xlsx")
In [7]:
integrins
Out[7]:
Unnamed: 0 primary_site ITGA10 ITGAD ITGAM ITGA3 ITGBL1 ITGAE ITGA2 ITGB3 ... ITGA6 ITGA2B ITGB1 ITGAL ITGA9 ITGB5 ITGA8 ITGA4 ITGA1 ITGA11
0 GTEX-13QIC-0011-R1a-SM-5O9CJ Brain 0.5763 -6.5064 2.2573 0.7832 1.0363 4.6035 2.5731 -2.8262 ... 2.8562 1.3846 5.8430 1.1316 -0.7108 3.5387 -0.0725 -0.4521 0.2029 -2.8262
1 GTEX-1399S-1726-SM-5L3DI Lung 4.9137 -3.6259 4.7307 7.1584 1.7702 4.9556 1.9149 2.6067 ... 4.2412 4.1211 7.7256 4.4900 2.9281 6.1483 5.1867 2.6185 4.7856 -0.0277
2 GTEX-PWCY-1326-SM-48TCU Ovary 2.3953 -5.0116 1.4547 4.2593 -0.7346 4.4149 0.2642 1.5216 ... 3.6816 1.5465 7.2964 -0.9406 2.7742 5.0414 2.0325 0.7579 2.2573 1.2516
3 GTEX-QXCU-0626-SM-2TC69 Lung 4.0541 -2.3147 4.5053 7.5651 4.1788 4.1772 5.3695 1.8444 ... 4.9631 1.9149 7.9947 3.3911 2.8462 6.7683 4.1636 2.7951 5.3284 1.2147
4 GTEX-ZA64-1526-SM-5CVMD Breast 2.0569 -2.4659 3.3993 3.1311 3.0074 4.4977 -1.7809 2.7139 ... 4.7340 0.6332 7.3496 -0.9406 2.5338 6.5696 1.7229 -0.6416 3.1195 1.1050
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1982 GTEX-QMRM-0826-SM-3NB33 Lung 5.3067 -3.8160 4.9065 7.5810 5.8714 4.7345 2.6185 3.1095 ... 5.6080 3.7324 8.2849 4.6201 3.6440 6.7052 5.1094 3.3364 5.8153 1.6604
1983 GTEX-YFCO-1626-SM-4W1Z3 Prostate 2.9581 -4.6082 1.1641 4.6938 1.5902 5.8625 -0.5125 1.7617 ... 3.8798 -1.4699 7.5163 -0.3752 2.9562 5.3035 4.4304 -0.9406 3.6136 0.4233
1984 GTEX-1117F-2826-SM-5GZXL Breast 4.3184 -6.5064 1.0433 4.8440 3.5498 4.6809 1.0293 3.3478 ... 5.3256 -0.0725 7.7516 1.1382 2.1411 7.1132 0.3796 0.0854 3.8650 1.0151
1985 GTEX-Q2AG-2826-SM-2HMJQ Brain 3.4622 -5.5735 1.5013 5.4835 1.7702 4.7517 0.6790 -3.1714 ... 1.1960 4.1740 4.3002 0.5470 -0.9971 3.7982 -0.2498 1.4808 -0.5125 -0.5125
1986 GTEX-XV7Q-0426-SM-4BRVN Lung 2.5585 -1.7809 6.7916 6.5865 2.7051 4.9519 4.3618 3.1892 ... 3.5779 2.8974 7.7685 4.8294 1.9149 5.9989 2.4117 2.4198 4.2080 1.0007

1987 rows × 29 columns

In [8]:
data_liver_lung = integrins[integrins['primary_site'].isin(['Liver', 'Lung'])]     #filter data by organ, display both brain and lung data
data_liver_lung
Out[8]:
Unnamed: 0 primary_site ITGA10 ITGAD ITGAM ITGA3 ITGBL1 ITGAE ITGA2 ITGB3 ... ITGA6 ITGA2B ITGB1 ITGAL ITGA9 ITGB5 ITGA8 ITGA4 ITGA1 ITGA11
1 GTEX-1399S-1726-SM-5L3DI Lung 4.9137 -3.6259 4.7307 7.1584 1.7702 4.9556 1.9149 2.6067 ... 4.2412 4.1211 7.7256 4.4900 2.9281 6.1483 5.1867 2.6185 4.7856 -0.0277
3 GTEX-QXCU-0626-SM-2TC69 Lung 4.0541 -2.3147 4.5053 7.5651 4.1788 4.1772 5.3695 1.8444 ... 4.9631 1.9149 7.9947 3.3911 2.8462 6.7683 4.1636 2.7951 5.3284 1.2147
5 GTEX-11EI6-0826-SM-5985V Lung 6.0732 -2.4659 3.9901 7.3945 4.7688 5.1157 4.3356 2.3366 ... 3.7378 4.7247 7.5016 5.1396 2.5036 6.5443 4.6531 3.8136 5.8679 0.7407
6 GTEX-S341-0326-SM-2XCAU Lung 4.2510 -5.0116 3.3076 6.1715 3.1129 5.2954 2.2960 1.1184 ... 4.7104 2.7530 7.5022 4.0730 2.6325 6.0483 5.0562 2.6962 5.1611 0.9343
7 GTEX-WY7C-0426-SM-3NB3C Lung 3.3633 -2.5479 4.8340 6.6864 3.0585 4.8294 2.6464 0.7999 ... 5.1190 1.5013 8.0260 3.6635 3.2435 5.8503 5.2991 2.8076 4.7571 -0.1345
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1969 GTEX-13FTZ-0726-SM-5IFFY Liver -0.6873 -3.4580 -0.5125 -0.3566 -0.4921 3.0654 -4.0350 -1.5951 ... 0.9493 -1.9942 5.2563 2.5924 -0.3752 4.5053 -4.6082 -2.2447 3.1458 -2.8262
1970 GTEX-RWS6-0226-SM-2XCA9 Lung 6.0830 -0.5756 4.3889 6.7302 4.6053 5.1065 2.8321 0.9716 ... 5.8176 2.5437 7.7929 4.9012 2.7993 6.7510 5.2204 2.8422 5.0951 -0.3201
1975 GTEX-131XE-0726-SM-5HL9K Lung 3.7971 -1.9379 4.8555 6.4052 3.9561 5.4263 3.2959 4.5199 ... 4.6697 6.5777 7.5114 5.2130 2.3816 6.6225 3.7389 3.7248 5.6809 0.8488
1982 GTEX-QMRM-0826-SM-3NB33 Lung 5.3067 -3.8160 4.9065 7.5810 5.8714 4.7345 2.6185 3.1095 ... 5.6080 3.7324 8.2849 4.6201 3.6440 6.7052 5.1094 3.3364 5.8153 1.6604
1986 GTEX-XV7Q-0426-SM-4BRVN Lung 2.5585 -1.7809 6.7916 6.5865 2.7051 4.9519 4.3618 3.1892 ... 3.5779 2.8974 7.7685 4.8294 1.9149 5.9989 2.4117 2.4198 4.2080 1.0007

398 rows × 29 columns

In [9]:
data_liver_lung_expression_only=data_liver_lung.iloc[:,1:]
In [10]:
data_liver_lung_expression_only
Out[10]:
primary_site ITGA10 ITGAD ITGAM ITGA3 ITGBL1 ITGAE ITGA2 ITGB3 ITGA7 ... ITGA6 ITGA2B ITGB1 ITGAL ITGA9 ITGB5 ITGA8 ITGA4 ITGA1 ITGA11
1 Lung 4.9137 -3.6259 4.7307 7.1584 1.7702 4.9556 1.9149 2.6067 3.9270 ... 4.2412 4.1211 7.7256 4.4900 2.9281 6.1483 5.1867 2.6185 4.7856 -0.0277
3 Lung 4.0541 -2.3147 4.5053 7.5651 4.1788 4.1772 5.3695 1.8444 4.5355 ... 4.9631 1.9149 7.9947 3.3911 2.8462 6.7683 4.1636 2.7951 5.3284 1.2147
5 Lung 6.0732 -2.4659 3.9901 7.3945 4.7688 5.1157 4.3356 2.3366 5.0527 ... 3.7378 4.7247 7.5016 5.1396 2.5036 6.5443 4.6531 3.8136 5.8679 0.7407
6 Lung 4.2510 -5.0116 3.3076 6.1715 3.1129 5.2954 2.2960 1.1184 5.2392 ... 4.7104 2.7530 7.5022 4.0730 2.6325 6.0483 5.0562 2.6962 5.1611 0.9343
7 Lung 3.3633 -2.5479 4.8340 6.6864 3.0585 4.8294 2.6464 0.7999 4.9246 ... 5.1190 1.5013 8.0260 3.6635 3.2435 5.8503 5.2991 2.8076 4.7571 -0.1345
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1969 Liver -0.6873 -3.4580 -0.5125 -0.3566 -0.4921 3.0654 -4.0350 -1.5951 2.3337 ... 0.9493 -1.9942 5.2563 2.5924 -0.3752 4.5053 -4.6082 -2.2447 3.1458 -2.8262
1970 Lung 6.0830 -0.5756 4.3889 6.7302 4.6053 5.1065 2.8321 0.9716 3.2973 ... 5.8176 2.5437 7.7929 4.9012 2.7993 6.7510 5.2204 2.8422 5.0951 -0.3201
1975 Lung 3.7971 -1.9379 4.8555 6.4052 3.9561 5.4263 3.2959 4.5199 5.5589 ... 4.6697 6.5777 7.5114 5.2130 2.3816 6.6225 3.7389 3.7248 5.6809 0.8488
1982 Lung 5.3067 -3.8160 4.9065 7.5810 5.8714 4.7345 2.6185 3.1095 5.2032 ... 5.6080 3.7324 8.2849 4.6201 3.6440 6.7052 5.1094 3.3364 5.8153 1.6604
1986 Lung 2.5585 -1.7809 6.7916 6.5865 2.7051 4.9519 4.3618 3.1892 7.7121 ... 3.5779 2.8974 7.7685 4.8294 1.9149 5.9989 2.4117 2.4198 4.2080 1.0007

398 rows × 28 columns

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#define wwhat is X and what is Y in your model
X=data_liver_lung_expression_only[['ITGA10']]
y=data_liver_lung_expression_only['primary_site']

#define split between train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#define the model you want to use : logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[11], line 1
----> 1 from sklearn.model_selection import train_test_split
      2 from sklearn.linear_model import LogisticRegression
      3 from sklearn.metrics import accuracy_score

ModuleNotFoundError: No module named 'sklearn'
In [12]:
#swithc ITGA10 to ITGB4, and see how that impact its accurancy
#define wwhat is X and what is Y in your model
X=data_liver_lung_expression_only[['ITGB4']]
y=data_liver_lung_expression_only['primary_site']

#define split between train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

#define the model you want to use : logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGB4: {accuracy:.2f}")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 7
      4 y=data_liver_lung_expression_only['primary_site']
      6 #define split between train and test 
----> 7 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
      9 #define the model you want to use : logistic regression
     10 model = LogisticRegression()

NameError: name 'train_test_split' is not defined
In [13]:
!pip install scikit-learn
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp39-cp39-win_amd64.whl (11.2 MB)
     --------------------------------------- 11.2/11.2 MB 38.5 MB/s eta 0:00:00
Collecting scipy>=1.6.0
  Downloading scipy-1.13.1-cp39-cp39-win_amd64.whl (46.2 MB)
     --------------------------------------- 46.2/46.2 MB 31.1 MB/s eta 0:00:00
Requirement already satisfied: numpy>=1.19.5 in c:\users\satvi\appdata\local\programs\python\python39\lib\site-packages (from scikit-learn) (2.0.2)
Collecting joblib>=1.2.0
  Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
     ------------------------------------- 308.4/308.4 KB 18.6 MB/s eta 0:00:00
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.5.2 scikit-learn-1.6.1 scipy-1.13.1 threadpoolctl-3.6.0
WARNING: You are using pip version 22.0.4; however, version 25.2 is available.
You should consider upgrading via the 'C:\Users\satvi\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.
In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#define wwhat is X and what is Y in your model
X=data_liver_lung_expression_only[['ITGA10']]
y=data_liver_lung_expression_only['primary_site']

#define split between train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#define the model you want to use : logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")
Accuracy using ITGA10: 1.00
In [15]:
#swithc ITGA10 to ITGB4, and see how that impact its accurancy
#define wwhat is X and what is Y in your model
X=data_liver_lung_expression_only[['ITGB4']]
y=data_liver_lung_expression_only['primary_site']

#define split between train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

#define the model you want to use : logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGB4: {accuracy:.2f}")
Accuracy using ITGB4: 0.97
In [16]:
len(y_test)
Out[16]:
160
In [18]:
#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
X = data_liver_lung_expression_only[['ITGB4']]  # 👈 Use your chosen integrin
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1})  # Binary encoding

# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for class "Lung"

# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) using ITGB4 expression')
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image
In [19]:
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGB4: {accuracy:.2f}")
Accuracy using ITGB4: 0.98
In [20]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Predict labels (not probabilities)
y_pred = model.predict(X_test)

# Step 2: Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
labels = ['Liver', 'Lung']  # same order as your encoded classes (0 and 1)

# Step 3: Plot with seaborn heatmap
plt.figure(figsize=(3, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Liver vs Lung) Using ITGB4')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [21]:
#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
X = data_liver_lung_expression_only[['ITGA10']]  # 👈 Use your chosen integrin
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1})  # Binary encoding

# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for class "Lung"

# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) using ITGA10 expression')
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image
In [22]:
#trying different machine learning models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

#Xtain, ytrain defined earlier 
#Step 1: Prepare data
X = data_liver_lung_expression_only[['ITGA10']]  # 👈 Use your chosen integrin
y = data_liver_lung_expression_only['primary_site']

# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Train KNN model
knn = KNeighborsClassifier(n_neighbors=3)  # try 3 or 5
knn.fit(X_train, y_train)

# Step 5: Predict
y_pred = knn.predict(X_test)

# Step 6: Evaluate
acc = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy: {acc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Liver', 'Lung']))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Brain', 'Lung'], yticklabels=['Liver', 'Lung'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('KNN Confusion Matrix (Liver vs Lung)')
plt.show()
✅ Accuracy: 0.98

Classification Report:
              precision    recall  f1-score   support

       Liver       0.93      1.00      0.96        25
        Lung       1.00      0.98      0.99        95

    accuracy                           0.98       120
   macro avg       0.96      0.99      0.98       120
weighted avg       0.98      0.98      0.98       120

No description has been provided for this image
In [25]:
# trying using two ITGs with KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# 1. Choose 2 integrins for 2D plot
X = data_liver_lung_expression_only[['ITGA10', 'ITGB1']]
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1})

# 2. Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# 4. Train KNN
#knn = KNeighborsClassifier(n_neighbors=5)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# 5. Create meshgrid for background prediction surface
h = 0.02  # grid step
x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# 6. Predict over the mesh
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# 7. Plot
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap='bwr')

# Plot training points
sns.scatterplot(x=X_scaled[:, 0], y=X_scaled[:, 1], hue=y,
                palette={0: 'blue', 1: 'red'}, edgecolor='k', alpha=0.8)

plt.xlabel('ITGA10 (scaled)')
plt.ylabel('ITGB1 (scaled)')
plt.title('KNN Neighborhood and Decision Boundary')
plt.legend(title='True Label', labels=['Liver', 'Lung'])
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [26]:
y_pred = knn.predict(X_test)

# Step 6: Evaluate
acc = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy: {acc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Liver', 'Lung']))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Liver', 'Lung'], yticklabels=['Liver', 'Lung'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('KNN Confusion Matrix (Liver vs Lung)')
plt.show()
✅ Accuracy: 0.99

Classification Report:
              precision    recall  f1-score   support

       Liver       0.96      1.00      0.98        25
        Lung       1.00      0.99      0.99        95

    accuracy                           0.99       120
   macro avg       0.98      0.99      0.99       120
weighted avg       0.99      0.99      0.99       120

No description has been provided for this image
In [27]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Predict class probabilities
y_proba = knn.predict_proba(X_test)[:, 1]  # probability of class 1 (Lung)

# Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc_score = roc_auc_score(y_test, y_proba)

# Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'KNN (AUC = {auc_score:.2f})', color='darkorange')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) - KNN')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [28]:
#get the model accuray for this ITGA10
#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
X = data_liver_lung_expression_only[['ITGA10']]  # 👈 Use your chosen integrin
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1})  # Binary encoding

# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for class "Lung"

# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) using ITGA10 expression')
plt.legend()
plt.grid(True)
plt.show()
# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")
No description has been provided for this image
Accuracy using ITGA10: 1.00
In [29]:
#get confusion matrix and calculate your own accurancy, precision and sensitivity 
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Liver', 'Lung'], yticklabels=['Liver', 'Lung'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix (Liver vs Lung)')
plt.show()
No description has been provided for this image
In [30]:
from sklearn.metrics import roc_curve, accuracy_score

# After model is trained and you have predicted probabilities
y_proba = model.predict_proba(X_test)[:, 1]  # Lung class = 1
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# Now find the threshold that gives highest accuracy
accuracies = []
for thresh in thresholds:
    y_pred_thresh = (y_proba >= thresh).astype(int)
    acc = accuracy_score(y_test, y_pred_thresh)
    accuracies.append(acc)

# Find best threshold
best_idx = np.argmax(accuracies)
best_threshold = thresholds[best_idx]
best_accuracy = accuracies[best_idx]

print(f"🎯 Best threshold: {best_threshold:.3f}")
print(f"✅ Highest accuracy at this threshold: {best_accuracy:.3f}")
🎯 Best threshold: 0.612
✅ Highest accuracy at this threshold: 1.000
In [31]:
a = model.coef_[0][0]
b = model.intercept_[0]
p_thresh = best_threshold  # the best probability threshold

# Invert the sigmoid function
ITGA10_thresh = (np.log(p_thresh / (1 - p_thresh)) - b) / a

print(f"🧬 ITGA10 expression threshold ≈ {ITGA10_thresh:.3f}")
🧬 ITGA10 expression threshold ≈ 1.876
In [32]:
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")
Accuracy using ITGA10: 1.00
In [33]:
from sklearn.metrics import accuracy_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np

# Already computed:
y_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# Compute accuracy at each threshold
accuracies = []
for thresh in thresholds:
    y_pred_thresh = (y_proba >= thresh).astype(int)
    acc = accuracy_score(y_test, y_pred_thresh)
    accuracies.append(acc)

# Plot accuracy vs threshold
plt.figure(figsize=(8, 5))
plt.plot(thresholds, accuracies, marker='o')
plt.xlabel("Probability Threshold")
plt.ylabel("Accuracy")
plt.title("Accuracy vs Threshold (ITGA10)")
plt.grid(True)

# Mark the best threshold
best_idx = np.argmax(accuracies)
plt.axvline(thresholds[best_idx], color='red', linestyle='--', label=f"Best: {thresholds[best_idx]:.2f}")
plt.legend()

plt.tight_layout()
plt.show()
No description has been provided for this image
In [34]:
print("Model coefficient:", model.coef_)
print("Model intercept:", model.intercept_)
Model coefficient: [[1.6319934]]
Model intercept: [-2.60809743]
In [35]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Predict labels (not probabilities)
y_pred = model.predict(X_test)

# Step 2: Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
labels = ['Liver', 'Lung']  # same order as your encoded classes (0 and 1)

# Step 3: Plot with seaborn heatmap
plt.figure(figsize=(3, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Liver vs Lung) Using ITGA10')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [36]:
print("Model coefficient:", model.coef_)
print("Model intercept:", model.intercept_)
Model coefficient: [[1.6319934]]
Model intercept: [-2.60809743]
In [37]:
decision_scores = model.decision_function(X_test)
print(decision_scores)
[ 5.18662945  6.05240195  7.85787625  3.05704126  2.46120047  5.19087264
  5.80025897 -3.61879094  1.29660998  3.60898143  4.55357921 -1.17112724
  6.58851178  3.34068172  2.75398009  1.85311973 -5.00696453  4.10935061
 -4.33131926  3.9376649   2.68527317 -3.25110283  5.89377219  4.51147378
  5.41102855  7.31931843  6.44652836  5.31115055 -3.31393457 -2.58231193
 -3.72976649  5.35635677 -0.97496163  2.64006695  4.91832974 -2.33914492
  2.78515116  3.71734579  5.73514243  4.62440773  4.64725563  3.30853145
  2.44194295  2.82285021  5.59789179  3.83615491  3.78523672  4.0377061
  4.57707992  2.54965452 -4.81912209  5.34036323 -3.88660106  3.98238152
  4.96125116 -4.81912209 -1.18401999  8.60190204  2.62488941  6.56860146
 -2.19650869 -2.33914492  3.70771703  5.24015884 -2.85354924  2.9991055
  4.31710337  3.65728844  4.67614192  0.94409941  2.32329703  3.7157138
  5.54746319  1.87776283 -1.52967619 -1.60523748  5.37789908  2.29702194
  6.73979757  4.16973436  4.32950652 -2.67745715 -2.13792013  6.86072828
  2.64773732 -5.95841668  3.93178973 -3.25110283  5.07744909  5.32502249
  4.05712682 -4.58966382  7.0043437   1.9603417   5.56002954  0.45384859
  5.39536141  4.60025422  4.88438427  5.90209536  5.97129188  8.15489905
  1.52704745  7.30789447  0.52957308  7.90618326  6.87492663  4.37128555
  7.90128728  5.58124546  4.19992624  6.95864789  4.73309849  2.26748286
  5.69124181  5.42800128 -3.80695978  1.23296224  4.41698136  6.52519044]
In [39]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
X = data_liver_lung_expression_only[['ITGA3','ITGB4']]  # 👈 Use your chosen integrin
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1})  # Binary encoding

# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for class "Lung"

# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) using ITGA3 and ITGB4expression')
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image
In [40]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Prepare features and target
selected_genes = ['ITGA10', 'ITGB4'] 
#X = integrins.iloc[:, -27:]  # Assuming the last 27 columns are integrins
X = integrins[selected_genes]  # using only selected gene; in this case just ITGA10 and ITGB4
y = integrins['primary_site']

# Step 2: Encode organ labels as numbers
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Step 3: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Step 4: Train multinomial logistic regression
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Step 5: Predict and evaluate
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Accuracy: 0.7939698492462312

Classification Report:
              precision    recall  f1-score   support

 Bone Marrow       0.77      1.00      0.87        10
       Brain       0.81      0.94      0.87       247
      Breast       0.64      0.41      0.50        44
       Liver       1.00      0.65      0.79        23
        Lung       0.76      0.88      0.82        43
       Ovary       0.50      0.10      0.17        10
    Prostate       0.75      0.14      0.24        21

    accuracy                           0.79       398
   macro avg       0.75      0.59      0.61       398
weighted avg       0.78      0.79      0.77       398


Confusion Matrix:
[[ 10   0   0   0   0   0   0]
 [  3 231   3   0   8   1   1]
 [  0  25  18   0   1   0   0]
 [  0   8   0  15   0   0   0]
 [  0   4   1   0  38   0   0]
 [  0   6   0   0   3   1   0]
 [  0  12   6   0   0   0   3]]
C:\Users\satvi\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
  warnings.warn(
In [41]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
No description has been provided for this image
In [42]:
integrins['primary_site'].value_counts()
Out[42]:
primary_site
Brain          1152
Lung            288
Breast          179
Liver           110
Prostate        100
Ovary            88
Bone Marrow      70
Name: count, dtype: int64
In [43]:
integrins.shape
Out[43]:
(1987, 29)
In [44]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
Accuracy: 0.79
In [45]:
data_liver_lung = integrins[integrins['primary_site'].isin(['Liver', 'Lung'])]     #filter data by organ, display both brain and lung data
data_liver_lung
Out[45]:
Unnamed: 0 primary_site ITGA10 ITGAD ITGAM ITGA3 ITGBL1 ITGAE ITGA2 ITGB3 ... ITGA6 ITGA2B ITGB1 ITGAL ITGA9 ITGB5 ITGA8 ITGA4 ITGA1 ITGA11
1 GTEX-1399S-1726-SM-5L3DI Lung 4.9137 -3.6259 4.7307 7.1584 1.7702 4.9556 1.9149 2.6067 ... 4.2412 4.1211 7.7256 4.4900 2.9281 6.1483 5.1867 2.6185 4.7856 -0.0277
3 GTEX-QXCU-0626-SM-2TC69 Lung 4.0541 -2.3147 4.5053 7.5651 4.1788 4.1772 5.3695 1.8444 ... 4.9631 1.9149 7.9947 3.3911 2.8462 6.7683 4.1636 2.7951 5.3284 1.2147
5 GTEX-11EI6-0826-SM-5985V Lung 6.0732 -2.4659 3.9901 7.3945 4.7688 5.1157 4.3356 2.3366 ... 3.7378 4.7247 7.5016 5.1396 2.5036 6.5443 4.6531 3.8136 5.8679 0.7407
6 GTEX-S341-0326-SM-2XCAU Lung 4.2510 -5.0116 3.3076 6.1715 3.1129 5.2954 2.2960 1.1184 ... 4.7104 2.7530 7.5022 4.0730 2.6325 6.0483 5.0562 2.6962 5.1611 0.9343
7 GTEX-WY7C-0426-SM-3NB3C Lung 3.3633 -2.5479 4.8340 6.6864 3.0585 4.8294 2.6464 0.7999 ... 5.1190 1.5013 8.0260 3.6635 3.2435 5.8503 5.2991 2.8076 4.7571 -0.1345
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1969 GTEX-13FTZ-0726-SM-5IFFY Liver -0.6873 -3.4580 -0.5125 -0.3566 -0.4921 3.0654 -4.0350 -1.5951 ... 0.9493 -1.9942 5.2563 2.5924 -0.3752 4.5053 -4.6082 -2.2447 3.1458 -2.8262
1970 GTEX-RWS6-0226-SM-2XCA9 Lung 6.0830 -0.5756 4.3889 6.7302 4.6053 5.1065 2.8321 0.9716 ... 5.8176 2.5437 7.7929 4.9012 2.7993 6.7510 5.2204 2.8422 5.0951 -0.3201
1975 GTEX-131XE-0726-SM-5HL9K Lung 3.7971 -1.9379 4.8555 6.4052 3.9561 5.4263 3.2959 4.5199 ... 4.6697 6.5777 7.5114 5.2130 2.3816 6.6225 3.7389 3.7248 5.6809 0.8488
1982 GTEX-QMRM-0826-SM-3NB33 Lung 5.3067 -3.8160 4.9065 7.5810 5.8714 4.7345 2.6185 3.1095 ... 5.6080 3.7324 8.2849 4.6201 3.6440 6.7052 5.1094 3.3364 5.8153 1.6604
1986 GTEX-XV7Q-0426-SM-4BRVN Lung 2.5585 -1.7809 6.7916 6.5865 2.7051 4.9519 4.3618 3.1892 ... 3.5779 2.8974 7.7685 4.8294 1.9149 5.9989 2.4117 2.4198 4.2080 1.0007

398 rows × 29 columns

In [46]:
data_liver_lung_expression_only=data_liver_lung.iloc[:,1:]
In [47]:
data_liver_lung_expression_only
Out[47]:
primary_site ITGA10 ITGAD ITGAM ITGA3 ITGBL1 ITGAE ITGA2 ITGB3 ITGA7 ... ITGA6 ITGA2B ITGB1 ITGAL ITGA9 ITGB5 ITGA8 ITGA4 ITGA1 ITGA11
1 Lung 4.9137 -3.6259 4.7307 7.1584 1.7702 4.9556 1.9149 2.6067 3.9270 ... 4.2412 4.1211 7.7256 4.4900 2.9281 6.1483 5.1867 2.6185 4.7856 -0.0277
3 Lung 4.0541 -2.3147 4.5053 7.5651 4.1788 4.1772 5.3695 1.8444 4.5355 ... 4.9631 1.9149 7.9947 3.3911 2.8462 6.7683 4.1636 2.7951 5.3284 1.2147
5 Lung 6.0732 -2.4659 3.9901 7.3945 4.7688 5.1157 4.3356 2.3366 5.0527 ... 3.7378 4.7247 7.5016 5.1396 2.5036 6.5443 4.6531 3.8136 5.8679 0.7407
6 Lung 4.2510 -5.0116 3.3076 6.1715 3.1129 5.2954 2.2960 1.1184 5.2392 ... 4.7104 2.7530 7.5022 4.0730 2.6325 6.0483 5.0562 2.6962 5.1611 0.9343
7 Lung 3.3633 -2.5479 4.8340 6.6864 3.0585 4.8294 2.6464 0.7999 4.9246 ... 5.1190 1.5013 8.0260 3.6635 3.2435 5.8503 5.2991 2.8076 4.7571 -0.1345
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1969 Liver -0.6873 -3.4580 -0.5125 -0.3566 -0.4921 3.0654 -4.0350 -1.5951 2.3337 ... 0.9493 -1.9942 5.2563 2.5924 -0.3752 4.5053 -4.6082 -2.2447 3.1458 -2.8262
1970 Lung 6.0830 -0.5756 4.3889 6.7302 4.6053 5.1065 2.8321 0.9716 3.2973 ... 5.8176 2.5437 7.7929 4.9012 2.7993 6.7510 5.2204 2.8422 5.0951 -0.3201
1975 Lung 3.7971 -1.9379 4.8555 6.4052 3.9561 5.4263 3.2959 4.5199 5.5589 ... 4.6697 6.5777 7.5114 5.2130 2.3816 6.6225 3.7389 3.7248 5.6809 0.8488
1982 Lung 5.3067 -3.8160 4.9065 7.5810 5.8714 4.7345 2.6185 3.1095 5.2032 ... 5.6080 3.7324 8.2849 4.6201 3.6440 6.7052 5.1094 3.3364 5.8153 1.6604
1986 Lung 2.5585 -1.7809 6.7916 6.5865 2.7051 4.9519 4.3618 3.1892 7.7121 ... 3.5779 2.8974 7.7685 4.8294 1.9149 5.9989 2.4117 2.4198 4.2080 1.0007

398 rows × 28 columns

In [48]:
data_liver_lung_expression_only['primary_site'].value_counts()
Out[48]:
primary_site
Lung     288
Liver    110
Name: count, dtype: int64
In [49]:
liver_lung_vertical = data_liver_lung_expression_only.melt(id_vars = 'primary_site', var_name = 'integrin_gene', value_name = 'expression_levels')
In [50]:
plt.figure(figsize=(16, 6))
sns.violinplot(x = 'integrin_gene', y = 'expression_levels', hue = 'primary_site', data = liver_lung_vertical, split = True, inner = 'quartile')
plt.title("Integrin Genes of the Lung vs. the Liver")
plt.xlabel("Integrin Gene")
plt.ylabel("Gene Expression Levels")
plt.legend(title = 'primary_site')
plt.show()
No description has been provided for this image
In [51]:
data_liver_lung_expression_only
Out[51]:
primary_site ITGA10 ITGAD ITGAM ITGA3 ITGBL1 ITGAE ITGA2 ITGB3 ITGA7 ... ITGA6 ITGA2B ITGB1 ITGAL ITGA9 ITGB5 ITGA8 ITGA4 ITGA1 ITGA11
1 Lung 4.9137 -3.6259 4.7307 7.1584 1.7702 4.9556 1.9149 2.6067 3.9270 ... 4.2412 4.1211 7.7256 4.4900 2.9281 6.1483 5.1867 2.6185 4.7856 -0.0277
3 Lung 4.0541 -2.3147 4.5053 7.5651 4.1788 4.1772 5.3695 1.8444 4.5355 ... 4.9631 1.9149 7.9947 3.3911 2.8462 6.7683 4.1636 2.7951 5.3284 1.2147
5 Lung 6.0732 -2.4659 3.9901 7.3945 4.7688 5.1157 4.3356 2.3366 5.0527 ... 3.7378 4.7247 7.5016 5.1396 2.5036 6.5443 4.6531 3.8136 5.8679 0.7407
6 Lung 4.2510 -5.0116 3.3076 6.1715 3.1129 5.2954 2.2960 1.1184 5.2392 ... 4.7104 2.7530 7.5022 4.0730 2.6325 6.0483 5.0562 2.6962 5.1611 0.9343
7 Lung 3.3633 -2.5479 4.8340 6.6864 3.0585 4.8294 2.6464 0.7999 4.9246 ... 5.1190 1.5013 8.0260 3.6635 3.2435 5.8503 5.2991 2.8076 4.7571 -0.1345
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1969 Liver -0.6873 -3.4580 -0.5125 -0.3566 -0.4921 3.0654 -4.0350 -1.5951 2.3337 ... 0.9493 -1.9942 5.2563 2.5924 -0.3752 4.5053 -4.6082 -2.2447 3.1458 -2.8262
1970 Lung 6.0830 -0.5756 4.3889 6.7302 4.6053 5.1065 2.8321 0.9716 3.2973 ... 5.8176 2.5437 7.7929 4.9012 2.7993 6.7510 5.2204 2.8422 5.0951 -0.3201
1975 Lung 3.7971 -1.9379 4.8555 6.4052 3.9561 5.4263 3.2959 4.5199 5.5589 ... 4.6697 6.5777 7.5114 5.2130 2.3816 6.6225 3.7389 3.7248 5.6809 0.8488
1982 Lung 5.3067 -3.8160 4.9065 7.5810 5.8714 4.7345 2.6185 3.1095 5.2032 ... 5.6080 3.7324 8.2849 4.6201 3.6440 6.7052 5.1094 3.3364 5.8153 1.6604
1986 Lung 2.5585 -1.7809 6.7916 6.5865 2.7051 4.9519 4.3618 3.1892 7.7121 ... 3.5779 2.8974 7.7685 4.8294 1.9149 5.9989 2.4117 2.4198 4.2080 1.0007

398 rows × 28 columns

In [52]:
X = data_liver_lung_expression_only['ITGA10']
X
Out[52]:
1       4.9137
3       4.0541
5       6.0732
6       4.2510
7       3.3633
         ...  
1969   -0.6873
1970    6.0830
1975    3.7971
1982    5.3067
1986    2.5585
Name: ITGA10, Length: 398, dtype: float64
In [53]:
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1})
y
Out[53]:
1       1
3       1
5       1
6       1
7       1
       ..
1969    0
1970    1
1975    1
1982    1
1986    1
Name: primary_site, Length: 398, dtype: int64
In [54]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
#X = data_liver_lung_expression_only[['ITGA3','ITGB4']]  # 👈 Use your chosen integrin
X = data_liver_lung_expression_only[['ITGA10']]
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1})  # Binary encoding

# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for class "Lung"

# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) using ITGA10 expression')
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image
In [57]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
Accuracy: 1.00
In [56]:
cm = confusion_matrix(y_test, y_pred)
labels = ['Liver', 'Lung']  # same order as your encoded classes (0 and 1)

# Step 3: Plot with seaborn heatmap
plt.figure(figsize=(3, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Liver vs Lung) Using ITGA10')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [58]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
#X = data_liver_lung_expression_only[['ITGA3','ITGB4']]  # 👈 Use your chosen integrin
X = data_liver_lung_expression_only[['ITGB7']]
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1})  # Binary encoding

# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for class "Lung"

# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Liver vs Lung) using ITGB7 expression')
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image
In [59]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Step 2: Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
labels = ['Liver', 'Lung']  # same order as your encoded classes (0 and 1)

# Step 3: Plot with seaborn heatmap
plt.figure(figsize=(3, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Liver vs Lung) Using ITGA10')
plt.tight_layout()
plt.show()
Accuracy: 0.81
No description has been provided for this image
In [ ]: