import pandas as pd                                      
import numpy as np

# Load the GTEx integrin expression workbook once. Both `data` and `integrins`
# come from the same file, so the second frame is a copy rather than a second
# parse of the workbook.
GTEX_INTEGRIN_XLSX = r"C:\Users\satvi\Downloads\gtex_integrin_7_organs.xlsx"
data = pd.read_excel(GTEX_INTEGRIN_XLSX)

data

# Independent copy, so changes made to `integrins` never show up in `data`.
integrins = data.copy()

integrins

# Keep only the rows whose primary_site is Liver or Lung.
data_liver_lung = integrins[integrins['primary_site'].isin(['Liver', 'Lung'])]
data_liver_lung

# Drop the first column (positional slice), keeping primary_site and every
# column after it.
data_liver_lung_expression_only = data_liver_lung.iloc[:, 1:]

data_liver_lung_expression_only

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Features (X): ITGA10 expression, selected with a list so X stays a 2-D frame.
# Target (y): the tissue label in primary_site.
X = data_liver_lung_expression_only.loc[:, ['ITGA10']]
y = data_liver_lung_expression_only.loc[:, 'primary_site']

# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a logistic-regression classifier with its default settings.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out samples.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[11], line 1
----> 1 from sklearn.model_selection import train_test_split
      2 from sklearn.linear_model import LogisticRegression
      3 from sklearn.metrics import accuracy_score

ModuleNotFoundError: No module named 'sklearn'

# Same experiment with ITGB4 in place of ITGA10, to compare the accuracy.
# Features (X): ITGB4 expression as a 2-D frame; target (y): the tissue label.
X = data_liver_lung_expression_only.loc[:, ['ITGB4']]
y = data_liver_lung_expression_only.loc[:, 'primary_site']

# Hold out 40% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Fit a logistic-regression classifier with its default settings.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out samples.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy using ITGB4: {accuracy:.2f}")

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 7
      4 y=data_liver_lung_expression_only['primary_site']
      6 #define split between train and test 
----> 7 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
      9 #define the model you want to use : logistic regression
     10 model = LogisticRegression()

NameError: name 'train_test_split' is not defined

# Install scikit-learn; the earlier `from sklearn ...` import failed with ModuleNotFoundError.
# NOTE(review): inside Jupyter, `%pip install scikit-learn` is usually preferred because it
# targets the running kernel's environment -- confirm before switching.
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp39-cp39-win_amd64.whl (11.2 MB)
     --------------------------------------- 11.2/11.2 MB 38.5 MB/s eta 0:00:00
Collecting scipy>=1.6.0
  Downloading scipy-1.13.1-cp39-cp39-win_amd64.whl (46.2 MB)
     --------------------------------------- 46.2/46.2 MB 31.1 MB/s eta 0:00:00
Requirement already satisfied: numpy>=1.19.5 in c:\users\satvi\appdata\local\programs\python\python39\lib\site-packages (from scikit-learn) (2.0.2)
Collecting joblib>=1.2.0
  Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
     ------------------------------------- 308.4/308.4 KB 18.6 MB/s eta 0:00:00
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.5.2 scikit-learn-1.6.1 scipy-1.13.1 threadpoolctl-3.6.0

WARNING: You are using pip version 22.0.4; however, version 25.2 is available.
You should consider upgrading via the 'C:\Users\satvi\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Features (X): ITGA10 expression, selected with a list so X stays a 2-D frame.
# Target (y): the tissue label in primary_site.
X = data_liver_lung_expression_only.loc[:, ['ITGA10']]
y = data_liver_lung_expression_only.loc[:, 'primary_site']

# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a logistic-regression classifier with its default settings.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out samples.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")

Accuracy using ITGA10: 1.00

# Same experiment with ITGB4 in place of ITGA10, to compare the accuracy.
# Features (X): ITGB4 expression as a 2-D frame; target (y): the tissue label.
X = data_liver_lung_expression_only.loc[:, ['ITGB4']]
y = data_liver_lung_expression_only.loc[:, 'primary_site']

# Hold out 40% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Fit a logistic-regression classifier with its default settings.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out samples.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy using ITGB4: {accuracy:.2f}")

Accuracy using ITGB4: 0.97

# Number of samples in the current held-out test split.
len(y_test)

160

#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Step 1: ITGB4 expression as the single feature; tissue encoded as Liver -> 0, Lung -> 1
X = data_liver_lung_expression_only.loc[:, ['ITGB4']]
y = data_liver_lung_expression_only.loc[:, 'primary_site'].map({'Liver': 0, 'Lung': 1})

# Step 2: 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 3: fit logistic regression on the training portion
model = LogisticRegression().fit(X_train, y_train)

# Step 4: second predict_proba column = probability of label 1 (Lung)
y_proba = model.predict_proba(X_test)[:, 1]

# Step 5: ROC points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: draw the ROC curve against the chance diagonal
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance-level reference
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve (Liver vs Lung) using ITGB4 expression')
ax.legend()
ax.grid(True)
plt.show()

# Hard class predictions from the current model, scored against the held-out labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy using ITGB4: {accuracy:.2f}")

Accuracy using ITGB4: 0.98

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: class labels (not probabilities) for the held-out samples
y_pred = model.predict(X_test)

# Step 2: confusion matrix; rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
labels = ['Liver', 'Lung']  # tick names for encoded classes 0 and 1

# Step 3: annotated heatmap on a small dedicated figure
fig, ax = plt.subplots(figsize=(3, 3))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix (Liver vs Lung) Using ITGB4')
plt.tight_layout()
plt.show()

#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Step 1: ITGA10 expression as the single feature; tissue encoded as Liver -> 0, Lung -> 1
X = data_liver_lung_expression_only.loc[:, ['ITGA10']]
y = data_liver_lung_expression_only.loc[:, 'primary_site'].map({'Liver': 0, 'Lung': 1})

# Step 2: 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 3: fit logistic regression on the training portion
model = LogisticRegression().fit(X_train, y_train)

# Step 4: second predict_proba column = probability of label 1 (Lung)
y_proba = model.predict_proba(X_test)[:, 1]

# Step 5: ROC points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: draw the ROC curve against the chance diagonal
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance-level reference
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve (Liver vs Lung) using ITGA10 expression')
ax.legend()
ax.grid(True)
plt.show()

#trying different machine learning models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# K-nearest-neighbours classifier on a single integrin (ITGA10).
# X, y and the train/test split are rebuilt here, so this cell does not rely on
# whatever split an earlier cell left behind.
class_names = ['Liver', 'Lung']  # used for the report rows and both heatmap axes

# Step 1: features (kept 2-D) and string tissue labels
X = data_liver_lung_expression_only[['ITGA10']]
y = data_liver_lung_expression_only['primary_site']

# Step 2: 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: train KNN (3 neighbours; 5 is another value worth trying)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Step 4: predict the held-out samples
y_pred = knn.predict(X_test)

# Step 5: evaluate
acc = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy: {acc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix. `labels` pins the row/column order to class_names so the
# tick labels cannot drift from the matrix. The x axis previously read
# 'Brain', although this data set only contains Liver and Lung samples.
cm = confusion_matrix(y_test, y_pred, labels=class_names)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('KNN Confusion Matrix (Liver vs Lung)')
plt.show()

✅ Accuracy: 0.98

Classification Report:
              precision    recall  f1-score   support

       Liver       0.93      1.00      0.96        25
        Lung       1.00      0.98      0.99        95

    accuracy                           0.98       120
   macro avg       0.96      0.99      0.98       120
weighted avg       0.98      0.98      0.98       120

# trying using two ITGs with KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# KNN on two integrins, with the decision regions drawn in the scaled 2-D feature space.

# 1. Two integrins as features; tissue encoded as Liver -> 0, Lung -> 1
X = data_liver_lung_expression_only[['ITGA10', 'ITGB1']]
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1})

# 2. Split BEFORE scaling, so the scaler never sees the held-out samples.
#    (Fitting the scaler on all rows first leaks test-set statistics into training.)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 3. Learn mean/std on the training rows only, then apply the same transform everywhere
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)  # every sample; used only for plotting below

# 4. Train KNN (n_neighbors=5 is another value worth trying)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# 5. Mesh grid covering the scaled data, padded by 1 unit on every side
h = 0.02  # grid step
x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# 6. Predict a class for every grid point to get the background surface
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# 7. Plot the decision regions
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap='bwr')

# Overlay ALL samples (train and test), coloured by their true label
ax = sns.scatterplot(x=X_scaled[:, 0], y=X_scaled[:, 1], hue=y,
                     palette={0: 'blue', 1: 'red'}, edgecolor='k', alpha=0.8)

plt.xlabel('ITGA10 (scaled)')
plt.ylabel('ITGB1 (scaled)')
plt.title('KNN Neighborhood and Decision Boundary')

# Rename the entries of the legend seaborn built for the hue levels (0, 1).
# Calling plt.legend(labels=[...]) instead would build a new legend from the
# first artists on the axes, which include the contour regions.
legend = ax.get_legend()
legend.set_title('True Label')
for text, name in zip(legend.get_texts(), ['Liver', 'Lung']):
    text.set_text(name)

plt.grid(True)
plt.tight_layout()
plt.show()

# Predict the held-out samples with the fitted KNN model.
y_pred = knn.predict(X_test)

# Overall accuracy plus the per-class precision/recall/F1 report.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"✅ Accuracy: {acc:.2f}")
print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=['Liver', 'Lung'])
print(report)

# Confusion matrix as an annotated heatmap (rows: true class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Liver', 'Lung'],
    yticklabels=['Liver', 'Lung'],
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('KNN Confusion Matrix (Liver vs Lung)')
plt.show()

✅ Accuracy: 0.99

Classification Report:
              precision    recall  f1-score   support

       Liver       0.96      1.00      0.98        25
        Lung       1.00      0.99      0.99        95

    accuracy                           0.99       120
   macro avg       0.98      0.99      0.99       120
weighted avg       0.99      0.99      0.99       120

from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Probability of label 1 (Lung) for each held-out sample, from the fitted KNN model
y_proba = knn.predict_proba(X_test)[:, 1]

# ROC points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc_score = roc_auc_score(y_test, y_proba)

# ROC curve against the chance diagonal
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, label=f'KNN (AUC = {auc_score:.2f})', color='darkorange')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve (Liver vs Lung) - KNN')
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()

#get the model accuray for this ITGA10
#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Step 1: ITGA10 expression as the single feature; tissue encoded as Liver -> 0, Lung -> 1
X = data_liver_lung_expression_only.loc[:, ['ITGA10']]
y = data_liver_lung_expression_only.loc[:, 'primary_site'].map({'Liver': 0, 'Lung': 1})

# Step 2: 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 3: fit logistic regression on the training portion
model = LogisticRegression().fit(X_train, y_train)

# Step 4: second predict_proba column = probability of label 1 (Lung)
y_proba = model.predict_proba(X_test)[:, 1]

# Step 5: ROC points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: draw the ROC curve against the chance diagonal
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance-level reference
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve (Liver vs Lung) using ITGA10 expression')
ax.legend()
ax.grid(True)
plt.show()

# Hard class predictions and their accuracy on the held-out samples
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")

Accuracy using ITGA10: 1.00

# Confusion matrix for the current predictions; accuracy, precision and
# sensitivity can be worked out by hand from its four cells.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Liver', 'Lung'],
    yticklabels=['Liver', 'Lung'],
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix (Liver vs Lung)')
plt.show()

from sklearn.metrics import roc_curve, accuracy_score

# Probability of label 1 (Lung) for each held-out sample, and the candidate
# cut-offs that roc_curve derives from those probabilities.
y_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# Accuracy obtained when each candidate cut-off is used as the decision rule
# (predict Lung when the probability is at or above the cut-off).
accuracies = [
    accuracy_score(y_test, (y_proba >= cutoff).astype(int))
    for cutoff in thresholds
]

# First cut-off that reaches the highest accuracy
best_idx = np.argmax(accuracies)
best_threshold = thresholds[best_idx]
best_accuracy = accuracies[best_idx]

print(f"🎯 Best threshold: {best_threshold:.3f}")
print(f"✅ Highest accuracy at this threshold: {best_accuracy:.3f}")

🎯 Best threshold: 0.612
✅ Highest accuracy at this threshold: 1.000

# Slope and intercept of the fitted single-feature logistic model
a = model.coef_[0][0]
b = model.intercept_[0]
p_thresh = best_threshold  # probability cut-off chosen above

# The model is p = sigmoid(a * x + b); solving for x gives x = (logit(p) - b) / a
log_odds = np.log(p_thresh / (1 - p_thresh))
ITGA10_thresh = (log_odds - b) / a

print(f"🧬 ITGA10 expression threshold ≈ {ITGA10_thresh:.3f}")

🧬 ITGA10 expression threshold ≈ 1.876

# Hard class predictions from the current model, scored against the held-out labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")

Accuracy using ITGA10: 1.00

from sklearn.metrics import accuracy_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np

# Recompute the Lung probabilities and the candidate cut-offs from the current model
y_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# Accuracy obtained at every candidate cut-off
accuracies = [
    accuracy_score(y_test, (y_proba >= cutoff).astype(int))
    for cutoff in thresholds
]

# Accuracy as a function of the probability cut-off
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(thresholds, accuracies, marker='o')
ax.set_xlabel("Probability Threshold")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy vs Threshold (ITGA10)")
ax.grid(True)

# Vertical marker at the first cut-off that reaches the highest accuracy
best_idx = np.argmax(accuracies)
ax.axvline(thresholds[best_idx], color='red', linestyle='--', label=f"Best: {thresholds[best_idx]:.2f}")
ax.legend()

plt.tight_layout()
plt.show()

# Show the fitted parameters of the current logistic-regression model.
print("Model coefficient:", model.coef_)
print("Model intercept:", model.intercept_)

Model coefficient: [[1.6319934]]
Model intercept: [-2.60809743]

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: class labels (not probabilities) for the held-out samples
y_pred = model.predict(X_test)

# Step 2: confusion matrix; rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
labels = ['Liver', 'Lung']  # tick names for encoded classes 0 and 1

# Step 3: annotated heatmap on a small dedicated figure
fig, ax = plt.subplots(figsize=(3, 3))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix (Liver vs Lung) Using ITGA10')
plt.tight_layout()
plt.show()

# Show the fitted parameters of the current logistic-regression model.
print("Model coefficient:", model.coef_)
print("Model intercept:", model.intercept_)

Model coefficient: [[1.6319934]]
Model intercept: [-2.60809743]

# Raw decision_function scores of the current model, one value per row of X_test.
decision_scores = model.decision_function(X_test)
print(decision_scores)

[ 5.18662945  6.05240195  7.85787625  3.05704126  2.46120047  5.19087264
  5.80025897 -3.61879094  1.29660998  3.60898143  4.55357921 -1.17112724
  6.58851178  3.34068172  2.75398009  1.85311973 -5.00696453  4.10935061
 -4.33131926  3.9376649   2.68527317 -3.25110283  5.89377219  4.51147378
  5.41102855  7.31931843  6.44652836  5.31115055 -3.31393457 -2.58231193
 -3.72976649  5.35635677 -0.97496163  2.64006695  4.91832974 -2.33914492
  2.78515116  3.71734579  5.73514243  4.62440773  4.64725563  3.30853145
  2.44194295  2.82285021  5.59789179  3.83615491  3.78523672  4.0377061
  4.57707992  2.54965452 -4.81912209  5.34036323 -3.88660106  3.98238152
  4.96125116 -4.81912209 -1.18401999  8.60190204  2.62488941  6.56860146
 -2.19650869 -2.33914492  3.70771703  5.24015884 -2.85354924  2.9991055
  4.31710337  3.65728844  4.67614192  0.94409941  2.32329703  3.7157138
  5.54746319  1.87776283 -1.52967619 -1.60523748  5.37789908  2.29702194
  6.73979757  4.16973436  4.32950652 -2.67745715 -2.13792013  6.86072828
  2.64773732 -5.95841668  3.93178973 -3.25110283  5.07744909  5.32502249
  4.05712682 -4.58966382  7.0043437   1.9603417   5.56002954  0.45384859
  5.39536141  4.60025422  4.88438427  5.90209536  5.97129188  8.15489905
  1.52704745  7.30789447  0.52957308  7.90618326  6.87492663  4.37128555
  7.90128728  5.58124546  4.19992624  6.95864789  4.73309849  2.26748286
  5.69124181  5.42800128 -3.80695978  1.23296224  4.41698136  6.52519044]

from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# ROC analysis for a logistic-regression model that uses two integrins together.

# Step 1: ITGA3 and ITGB4 as features; tissue encoded as Liver -> 0, Lung -> 1
X = data_liver_lung_expression_only[['ITGA3', 'ITGB4']]
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1})

# Step 2: 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: fit logistic regression on the training portion
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: second predict_proba column = probability of label 1 (Lung)
y_proba = model.predict_proba(X_test)[:, 1]

# Step 5: ROC points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: draw the ROC curve against the chance diagonal
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance-level reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Title previously ran the words together as "ITGB4expression".
plt.title('ROC Curve (Liver vs Lung) using ITGA3 and ITGB4 expression')
plt.legend()
plt.grid(True)
plt.show()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Multi-class logistic regression over every tissue in `integrins`
# (not just liver/lung), using two integrins as features.

# Step 1: features and target.
# Alternative feature set that was tried: X = integrins.iloc[:, -27:]
selected_genes = ['ITGA10', 'ITGB4']
X = integrins[selected_genes]
y = integrins['primary_site']

# Step 2: encode the tissue names as integers; le.classes_ keeps the names
# in the order of their codes for use in the report below.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Step 3: 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Step 4: train the classifier.
# `multi_class` is no longer passed: scikit-learn 1.5+ emits a FutureWarning
# for it and states that the multinomial formulation will always be used, so
# leaving it at its default keeps the same model without the warning.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Step 5: predict and evaluate on the held-out samples
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7939698492462312

Classification Report:
              precision    recall  f1-score   support

 Bone Marrow       0.77      1.00      0.87        10
       Brain       0.81      0.94      0.87       247
      Breast       0.64      0.41      0.50        44
       Liver       1.00      0.65      0.79        23
        Lung       0.76      0.88      0.82        43
       Ovary       0.50      0.10      0.17        10
    Prostate       0.75      0.14      0.24        21

    accuracy                           0.79       398
   macro avg       0.75      0.59      0.61       398
weighted avg       0.78      0.79      0.77       398


Confusion Matrix:
[[ 10   0   0   0   0   0   0]
 [  3 231   3   0   8   1   1]
 [  0  25  18   0   1   0   0]
 [  0   8   0  15   0   0   0]
 [  0   4   1   0  38   0   0]
 [  0   6   0   0   3   1   0]
 [  0  12   6   0   0   0   3]]

C:\Users\satvi\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
  warnings.warn(

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Confusion matrix for the multi-class predictions, drawn as an annotated
# heatmap with the decoded tissue names on both axes.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

# Samples per tissue across the full table.
integrins['primary_site'].value_counts()

primary_site
Brain          1152
Lung            288
Breast          179
Liver           110
Prostate        100
Ovary            88
Bone Marrow      70
Name: count, dtype: int64

# (rows, columns) of the full table.
integrins.shape

(1987, 29)

# Accuracy of the current predictions, rounded to two decimals for display.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.79

# Rebuild the liver/lung subset from the full table.
data_liver_lung = integrins[integrins['primary_site'].isin(['Liver', 'Lung'])]     # keep only the Liver and Lung rows
data_liver_lung

# Drop the first column (positional slice), keeping primary_site and every column after it.
data_liver_lung_expression_only=data_liver_lung.iloc[:,1:]

data_liver_lung_expression_only

# Samples per tissue in the liver/lung subset.
data_liver_lung_expression_only['primary_site'].value_counts()

primary_site
Lung     288
Liver    110
Name: count, dtype: int64

# Reshape to long format: one row per (sample, gene) pair, keeping the tissue label.
liver_lung_vertical = pd.melt(
    data_liver_lung_expression_only,
    id_vars='primary_site',
    var_name='integrin_gene',
    value_name='expression_levels',
)

# Split violins: one violin per gene, with its two halves separated by tissue.
fig, ax = plt.subplots(figsize=(16, 6))
sns.violinplot(
    data=liver_lung_vertical,
    x='integrin_gene',
    y='expression_levels',
    hue='primary_site',
    split=True,
    inner='quartile',
    ax=ax,
)
ax.set_title("Integrin Genes of the Lung vs. the Liver")
ax.set_xlabel("Integrin Gene")
ax.set_ylabel("Gene Expression Levels")
ax.legend(title='primary_site')
plt.show()

# Display the liver/lung subset.
data_liver_lung_expression_only

# Single brackets select one column as a 1-D Series (inspection only).
X = data_liver_lung_expression_only['ITGA10']
X

1       4.9137
3       4.0541
5       6.0732
6       4.2510
7       3.3633
         ...  
1969   -0.6873
1970    6.0830
1975    3.7971
1982    5.3067
1986    2.5585
Name: ITGA10, Length: 398, dtype: float64

# Encode the tissue label as an integer: Liver -> 0, Lung -> 1.
y = data_liver_lung_expression_only['primary_site'].map({'Liver': 0, 'Lung': 1})
y

1       1
3       1
5       1
6       1
7       1
       ..
1969    0
1970    1
1975    1
1982    1
1986    1
Name: primary_site, Length: 398, dtype: int64

from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Step 1: ITGA10 expression as the single feature; tissue encoded as Liver -> 0, Lung -> 1
# (ITGA3 + ITGB4 was the feature pair tried previously)
X = data_liver_lung_expression_only.loc[:, ['ITGA10']]
y = data_liver_lung_expression_only.loc[:, 'primary_site'].map({'Liver': 0, 'Lung': 1})

# Step 2: 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 3: fit logistic regression on the training portion
model = LogisticRegression().fit(X_train, y_train)

# Step 4: second predict_proba column = probability of label 1 (Lung)
y_proba = model.predict_proba(X_test)[:, 1]

# Step 5: ROC points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: draw the ROC curve against the chance diagonal
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance-level reference
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve (Liver vs Lung) using ITGA10 expression')
ax.legend()
ax.grid(True)
plt.show()

# Hard class predictions from the current model, scored against the held-out labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00

# Confusion matrix for the current predictions; rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
labels = ['Liver', 'Lung']  # tick names for encoded classes 0 and 1

# Annotated heatmap on a small dedicated figure
fig, ax = plt.subplots(figsize=(3, 3))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix (Liver vs Lung) Using ITGA10')
plt.tight_layout()
plt.show()

from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Step 1: ITGB7 expression as the single feature; tissue encoded as Liver -> 0, Lung -> 1
# (ITGA3 + ITGB4 was a feature pair tried previously)
X = data_liver_lung_expression_only.loc[:, ['ITGB7']]
y = data_liver_lung_expression_only.loc[:, 'primary_site'].map({'Liver': 0, 'Lung': 1})

# Step 2: 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 3: fit logistic regression on the training portion
model = LogisticRegression().fit(X_train, y_train)

# Step 4: second predict_proba column = probability of label 1 (Lung)
y_proba = model.predict_proba(X_test)[:, 1]

# Step 5: ROC points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: draw the ROC curve against the chance diagonal
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance-level reference
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve (Liver vs Lung) using ITGB7 expression')
ax.legend()
ax.grid(True)
plt.show()

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Confusion matrix of the ITGB7 model's held-out predictions (rows: true
# class, columns: predicted class).
# labels=[0, 1] pins the row/column order to the encoding (Liver=0, Lung=1),
# so the matrix is always 2x2 and always lines up with the tick labels below,
# even if one class happens to be absent from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
labels = ['Liver', 'Lung']  # tick labels, same order as the encoded classes (0 and 1)

# Draw the matrix as an annotated heatmap with integer counts in each cell.
plt.figure(figsize=(3, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# The model in this cell is trained on ITGB7, so the title must name ITGB7
# (it previously said ITGA10, carried over from the cell it was copied from).
plt.title('Confusion Matrix (Liver vs Lung) Using ITGB7')
plt.tight_layout()
plt.show()

Accuracy: 0.81

	Unnamed: 0	primary_site	ITGA10	ITGAD	ITGAM	ITGA3	ITGBL1	ITGAE	ITGA2	ITGB3	...	ITGA6	ITGA2B	ITGB1	ITGAL	ITGA9	ITGB5	ITGA8	ITGA4	ITGA1	ITGA11
0	GTEX-13QIC-0011-R1a-SM-5O9CJ	Brain	0.5763	-6.5064	2.2573	0.7832	1.0363	4.6035	2.5731	-2.8262	...	2.8562	1.3846	5.8430	1.1316	-0.7108	3.5387	-0.0725	-0.4521	0.2029	-2.8262
1	GTEX-1399S-1726-SM-5L3DI	Lung	4.9137	-3.6259	4.7307	7.1584	1.7702	4.9556	1.9149	2.6067	...	4.2412	4.1211	7.7256	4.4900	2.9281	6.1483	5.1867	2.6185	4.7856	-0.0277
2	GTEX-PWCY-1326-SM-48TCU	Ovary	2.3953	-5.0116	1.4547	4.2593	-0.7346	4.4149	0.2642	1.5216	...	3.6816	1.5465	7.2964	-0.9406	2.7742	5.0414	2.0325	0.7579	2.2573	1.2516
3	GTEX-QXCU-0626-SM-2TC69	Lung	4.0541	-2.3147	4.5053	7.5651	4.1788	4.1772	5.3695	1.8444	...	4.9631	1.9149	7.9947	3.3911	2.8462	6.7683	4.1636	2.7951	5.3284	1.2147
4	GTEX-ZA64-1526-SM-5CVMD	Breast	2.0569	-2.4659	3.3993	3.1311	3.0074	4.4977	-1.7809	2.7139	...	4.7340	0.6332	7.3496	-0.9406	2.5338	6.5696	1.7229	-0.6416	3.1195	1.1050
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1982	GTEX-QMRM-0826-SM-3NB33	Lung	5.3067	-3.8160	4.9065	7.5810	5.8714	4.7345	2.6185	3.1095	...	5.6080	3.7324	8.2849	4.6201	3.6440	6.7052	5.1094	3.3364	5.8153	1.6604
1983	GTEX-YFCO-1626-SM-4W1Z3	Prostate	2.9581	-4.6082	1.1641	4.6938	1.5902	5.8625	-0.5125	1.7617	...	3.8798	-1.4699	7.5163	-0.3752	2.9562	5.3035	4.4304	-0.9406	3.6136	0.4233
1984	GTEX-1117F-2826-SM-5GZXL	Breast	4.3184	-6.5064	1.0433	4.8440	3.5498	4.6809	1.0293	3.3478	...	5.3256	-0.0725	7.7516	1.1382	2.1411	7.1132	0.3796	0.0854	3.8650	1.0151
1985	GTEX-Q2AG-2826-SM-2HMJQ	Brain	3.4622	-5.5735	1.5013	5.4835	1.7702	4.7517	0.6790	-3.1714	...	1.1960	4.1740	4.3002	0.5470	-0.9971	3.7982	-0.2498	1.4808	-0.5125	-0.5125
1986	GTEX-XV7Q-0426-SM-4BRVN	Lung	2.5585	-1.7809	6.7916	6.5865	2.7051	4.9519	4.3618	3.1892	...	3.5779	2.8974	7.7685	4.8294	1.9149	5.9989	2.4117	2.4198	4.2080	1.0007

	Unnamed: 0	primary_site	ITGA10	ITGAD	ITGAM	ITGA3	ITGBL1	ITGAE	ITGA2	ITGB3	...	ITGA6	ITGA2B	ITGB1	ITGAL	ITGA9	ITGB5	ITGA8	ITGA4	ITGA1	ITGA11
0	GTEX-13QIC-0011-R1a-SM-5O9CJ	Brain	0.5763	-6.5064	2.2573	0.7832	1.0363	4.6035	2.5731	-2.8262	...	2.8562	1.3846	5.8430	1.1316	-0.7108	3.5387	-0.0725	-0.4521	0.2029	-2.8262
1	GTEX-1399S-1726-SM-5L3DI	Lung	4.9137	-3.6259	4.7307	7.1584	1.7702	4.9556	1.9149	2.6067	...	4.2412	4.1211	7.7256	4.4900	2.9281	6.1483	5.1867	2.6185	4.7856	-0.0277
2	GTEX-PWCY-1326-SM-48TCU	Ovary	2.3953	-5.0116	1.4547	4.2593	-0.7346	4.4149	0.2642	1.5216	...	3.6816	1.5465	7.2964	-0.9406	2.7742	5.0414	2.0325	0.7579	2.2573	1.2516
3	GTEX-QXCU-0626-SM-2TC69	Lung	4.0541	-2.3147	4.5053	7.5651	4.1788	4.1772	5.3695	1.8444	...	4.9631	1.9149	7.9947	3.3911	2.8462	6.7683	4.1636	2.7951	5.3284	1.2147
4	GTEX-ZA64-1526-SM-5CVMD	Breast	2.0569	-2.4659	3.3993	3.1311	3.0074	4.4977	-1.7809	2.7139	...	4.7340	0.6332	7.3496	-0.9406	2.5338	6.5696	1.7229	-0.6416	3.1195	1.1050
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1982	GTEX-QMRM-0826-SM-3NB33	Lung	5.3067	-3.8160	4.9065	7.5810	5.8714	4.7345	2.6185	3.1095	...	5.6080	3.7324	8.2849	4.6201	3.6440	6.7052	5.1094	3.3364	5.8153	1.6604
1983	GTEX-YFCO-1626-SM-4W1Z3	Prostate	2.9581	-4.6082	1.1641	4.6938	1.5902	5.8625	-0.5125	1.7617	...	3.8798	-1.4699	7.5163	-0.3752	2.9562	5.3035	4.4304	-0.9406	3.6136	0.4233
1984	GTEX-1117F-2826-SM-5GZXL	Breast	4.3184	-6.5064	1.0433	4.8440	3.5498	4.6809	1.0293	3.3478	...	5.3256	-0.0725	7.7516	1.1382	2.1411	7.1132	0.3796	0.0854	3.8650	1.0151
1985	GTEX-Q2AG-2826-SM-2HMJQ	Brain	3.4622	-5.5735	1.5013	5.4835	1.7702	4.7517	0.6790	-3.1714	...	1.1960	4.1740	4.3002	0.5470	-0.9971	3.7982	-0.2498	1.4808	-0.5125	-0.5125
1986	GTEX-XV7Q-0426-SM-4BRVN	Lung	2.5585	-1.7809	6.7916	6.5865	2.7051	4.9519	4.3618	3.1892	...	3.5779	2.8974	7.7685	4.8294	1.9149	5.9989	2.4117	2.4198	4.2080	1.0007

	Unnamed: 0	primary_site	ITGA10	ITGAD	ITGAM	ITGA3	ITGBL1	ITGAE	ITGA2	ITGB3	...	ITGA6	ITGA2B	ITGB1	ITGAL	ITGA9	ITGB5	ITGA8	ITGA4	ITGA1	ITGA11
1	GTEX-1399S-1726-SM-5L3DI	Lung	4.9137	-3.6259	4.7307	7.1584	1.7702	4.9556	1.9149	2.6067	...	4.2412	4.1211	7.7256	4.4900	2.9281	6.1483	5.1867	2.6185	4.7856	-0.0277
3	GTEX-QXCU-0626-SM-2TC69	Lung	4.0541	-2.3147	4.5053	7.5651	4.1788	4.1772	5.3695	1.8444	...	4.9631	1.9149	7.9947	3.3911	2.8462	6.7683	4.1636	2.7951	5.3284	1.2147
5	GTEX-11EI6-0826-SM-5985V	Lung	6.0732	-2.4659	3.9901	7.3945	4.7688	5.1157	4.3356	2.3366	...	3.7378	4.7247	7.5016	5.1396	2.5036	6.5443	4.6531	3.8136	5.8679	0.7407
6	GTEX-S341-0326-SM-2XCAU	Lung	4.2510	-5.0116	3.3076	6.1715	3.1129	5.2954	2.2960	1.1184	...	4.7104	2.7530	7.5022	4.0730	2.6325	6.0483	5.0562	2.6962	5.1611	0.9343
7	GTEX-WY7C-0426-SM-3NB3C	Lung	3.3633	-2.5479	4.8340	6.6864	3.0585	4.8294	2.6464	0.7999	...	5.1190	1.5013	8.0260	3.6635	3.2435	5.8503	5.2991	2.8076	4.7571	-0.1345
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1969	GTEX-13FTZ-0726-SM-5IFFY	Liver	-0.6873	-3.4580	-0.5125	-0.3566	-0.4921	3.0654	-4.0350	-1.5951	...	0.9493	-1.9942	5.2563	2.5924	-0.3752	4.5053	-4.6082	-2.2447	3.1458	-2.8262
1970	GTEX-RWS6-0226-SM-2XCA9	Lung	6.0830	-0.5756	4.3889	6.7302	4.6053	5.1065	2.8321	0.9716	...	5.8176	2.5437	7.7929	4.9012	2.7993	6.7510	5.2204	2.8422	5.0951	-0.3201
1975	GTEX-131XE-0726-SM-5HL9K	Lung	3.7971	-1.9379	4.8555	6.4052	3.9561	5.4263	3.2959	4.5199	...	4.6697	6.5777	7.5114	5.2130	2.3816	6.6225	3.7389	3.7248	5.6809	0.8488
1982	GTEX-QMRM-0826-SM-3NB33	Lung	5.3067	-3.8160	4.9065	7.5810	5.8714	4.7345	2.6185	3.1095	...	5.6080	3.7324	8.2849	4.6201	3.6440	6.7052	5.1094	3.3364	5.8153	1.6604
1986	GTEX-XV7Q-0426-SM-4BRVN	Lung	2.5585	-1.7809	6.7916	6.5865	2.7051	4.9519	4.3618	3.1892	...	3.5779	2.8974	7.7685	4.8294	1.9149	5.9989	2.4117	2.4198	4.2080	1.0007

	primary_site	ITGA10	ITGAD	ITGAM	ITGA3	ITGBL1	ITGAE	ITGA2	ITGB3	ITGA7	...	ITGA6	ITGA2B	ITGB1	ITGAL	ITGA9	ITGB5	ITGA8	ITGA4	ITGA1	ITGA11
1	Lung	4.9137	-3.6259	4.7307	7.1584	1.7702	4.9556	1.9149	2.6067	3.9270	...	4.2412	4.1211	7.7256	4.4900	2.9281	6.1483	5.1867	2.6185	4.7856	-0.0277
3	Lung	4.0541	-2.3147	4.5053	7.5651	4.1788	4.1772	5.3695	1.8444	4.5355	...	4.9631	1.9149	7.9947	3.3911	2.8462	6.7683	4.1636	2.7951	5.3284	1.2147
5	Lung	6.0732	-2.4659	3.9901	7.3945	4.7688	5.1157	4.3356	2.3366	5.0527	...	3.7378	4.7247	7.5016	5.1396	2.5036	6.5443	4.6531	3.8136	5.8679	0.7407
6	Lung	4.2510	-5.0116	3.3076	6.1715	3.1129	5.2954	2.2960	1.1184	5.2392	...	4.7104	2.7530	7.5022	4.0730	2.6325	6.0483	5.0562	2.6962	5.1611	0.9343
7	Lung	3.3633	-2.5479	4.8340	6.6864	3.0585	4.8294	2.6464	0.7999	4.9246	...	5.1190	1.5013	8.0260	3.6635	3.2435	5.8503	5.2991	2.8076	4.7571	-0.1345
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1969	Liver	-0.6873	-3.4580	-0.5125	-0.3566	-0.4921	3.0654	-4.0350	-1.5951	2.3337	...	0.9493	-1.9942	5.2563	2.5924	-0.3752	4.5053	-4.6082	-2.2447	3.1458	-2.8262
1970	Lung	6.0830	-0.5756	4.3889	6.7302	4.6053	5.1065	2.8321	0.9716	3.2973	...	5.8176	2.5437	7.7929	4.9012	2.7993	6.7510	5.2204	2.8422	5.0951	-0.3201
1975	Lung	3.7971	-1.9379	4.8555	6.4052	3.9561	5.4263	3.2959	4.5199	5.5589	...	4.6697	6.5777	7.5114	5.2130	2.3816	6.6225	3.7389	3.7248	5.6809	0.8488
1982	Lung	5.3067	-3.8160	4.9065	7.5810	5.8714	4.7345	2.6185	3.1095	5.2032	...	5.6080	3.7324	8.2849	4.6201	3.6440	6.7052	5.1094	3.3364	5.8153	1.6604
1986	Lung	2.5585	-1.7809	6.7916	6.5865	2.7051	4.9519	4.3618	3.1892	7.7121	...	3.5779	2.8974	7.7685	4.8294	1.9149	5.9989	2.4117	2.4198	4.2080	1.0007

	Unnamed: 0	primary_site	ITGA10	ITGAD	ITGAM	ITGA3	ITGBL1	ITGAE	ITGA2	ITGB3	...	ITGA6	ITGA2B	ITGB1	ITGAL	ITGA9	ITGB5	ITGA8	ITGA4	ITGA1	ITGA11
1	GTEX-1399S-1726-SM-5L3DI	Lung	4.9137	-3.6259	4.7307	7.1584	1.7702	4.9556	1.9149	2.6067	...	4.2412	4.1211	7.7256	4.4900	2.9281	6.1483	5.1867	2.6185	4.7856	-0.0277
3	GTEX-QXCU-0626-SM-2TC69	Lung	4.0541	-2.3147	4.5053	7.5651	4.1788	4.1772	5.3695	1.8444	...	4.9631	1.9149	7.9947	3.3911	2.8462	6.7683	4.1636	2.7951	5.3284	1.2147
5	GTEX-11EI6-0826-SM-5985V	Lung	6.0732	-2.4659	3.9901	7.3945	4.7688	5.1157	4.3356	2.3366	...	3.7378	4.7247	7.5016	5.1396	2.5036	6.5443	4.6531	3.8136	5.8679	0.7407
6	GTEX-S341-0326-SM-2XCAU	Lung	4.2510	-5.0116	3.3076	6.1715	3.1129	5.2954	2.2960	1.1184	...	4.7104	2.7530	7.5022	4.0730	2.6325	6.0483	5.0562	2.6962	5.1611	0.9343
7	GTEX-WY7C-0426-SM-3NB3C	Lung	3.3633	-2.5479	4.8340	6.6864	3.0585	4.8294	2.6464	0.7999	...	5.1190	1.5013	8.0260	3.6635	3.2435	5.8503	5.2991	2.8076	4.7571	-0.1345
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1969	GTEX-13FTZ-0726-SM-5IFFY	Liver	-0.6873	-3.4580	-0.5125	-0.3566	-0.4921	3.0654	-4.0350	-1.5951	...	0.9493	-1.9942	5.2563	2.5924	-0.3752	4.5053	-4.6082	-2.2447	3.1458	-2.8262
1970	GTEX-RWS6-0226-SM-2XCA9	Lung	6.0830	-0.5756	4.3889	6.7302	4.6053	5.1065	2.8321	0.9716	...	5.8176	2.5437	7.7929	4.9012	2.7993	6.7510	5.2204	2.8422	5.0951	-0.3201
1975	GTEX-131XE-0726-SM-5HL9K	Lung	3.7971	-1.9379	4.8555	6.4052	3.9561	5.4263	3.2959	4.5199	...	4.6697	6.5777	7.5114	5.2130	2.3816	6.6225	3.7389	3.7248	5.6809	0.8488
1982	GTEX-QMRM-0826-SM-3NB33	Lung	5.3067	-3.8160	4.9065	7.5810	5.8714	4.7345	2.6185	3.1095	...	5.6080	3.7324	8.2849	4.6201	3.6440	6.7052	5.1094	3.3364	5.8153	1.6604
1986	GTEX-XV7Q-0426-SM-4BRVN	Lung	2.5585	-1.7809	6.7916	6.5865	2.7051	4.9519	4.3618	3.1892	...	3.5779	2.8974	7.7685	4.8294	1.9149	5.9989	2.4117	2.4198	4.2080	1.0007