In [1]:
import cv2
import numpy as np
import os
from sklearn.decomposition import PCA
from scipy.spatial.distance import euclidean
import matplotlib.pyplot as plt

Load and Process Images

In [2]:
folder_path = "images/EasyDiffusion-Images-LRI758YQ/"
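
As a quick sanity check (a sketch, not part of the original run; it relies only on the imports from the first cell), the cell below counts the image files that will be processed from `folder_path`.

In [ ]:
# Sketch: count the image files under folder_path before feature extraction.
image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(image_extensions)]
print(f"Found {len(image_files)} image files in {folder_path}")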

Feature Extraction

In [3]:
from keras.applications.vgg19 import VGG19

# Load the VGG19 model to inspect its layer names
model = VGG19(weights='imagenet')

# Print all layer names
for i, layer in enumerate(model.layers):
    print(i, layer.name)
0 input_1
1 block1_conv1
2 block1_conv2
3 block1_pool
4 block2_conv1
5 block2_conv2
6 block2_pool
7 block3_conv1
8 block3_conv2
9 block3_conv3
10 block3_conv4
11 block3_pool
12 block4_conv1
13 block4_conv2
14 block4_conv3
15 block4_conv4
16 block4_pool
17 block5_conv1
18 block5_conv2
19 block5_conv3
20 block5_conv4
21 block5_pool
22 flatten
23 fc1
24 fc2
25 predictions
In [4]:
from keras.applications.resnet_v2 import ResNet50V2, preprocess_input

from keras.preprocessing import image
from keras.models import Model
import numpy as np
from scipy.spatial.distance import cdist

model = ResNet50V2(
    include_top=True,
    weights="imagenet",
    input_tensor=None,
    input_shape=None,
    pooling=None,
    classes=1000,
    classifier_activation="softmax"
)

# Identity wrap as a Model; swap `outputs` for an intermediate layer (see the commented example below) to probe other layers
model = Model(inputs=model.input, outputs=model.output)

# experimental feature to test individual layers of network
# layer_name = 'conv5_block2_out'
# intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)

def extract_features(img_path, model):
    img = image.load_img(img_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    expanded_img_array = np.expand_dims(img_array, axis=0)
    preprocessed_img = preprocess_input(expanded_img_array)
    features = model.predict(preprocessed_img)
    return features.flatten()
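A hedged alternative worth noting (not the configuration used in this run): dropping the classification head and average-pooling the last convolutional block yields a 2048-dimensional embedding instead of the 1000-class softmax vector, which is a more common choice for image similarity.

In [ ]:
# Sketch of the alternative: pooled convolutional features instead of softmax scores.
# extract_features(img_path, feature_model) would then return a 2048-dim vector per image.
feature_model = ResNet50V2(include_top=False, weights="imagenet", pooling="avg")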
In [5]:
# model = model_res
model_name = "ResNet50V2"

features_list = []
filenames = []

for filename in os.listdir(folder_path):
    img_path = os.path.join(folder_path, filename)
    if not img_path.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
        print(f"Skipping non-image file: {img_path}")
        continue
    features = extract_features(img_path, model)
    if features is not None:
        features_list.append(features)
        filenames.append(filename)
1/1 [==============================] - 1s 592ms/step
1/1 [==============================] - 0s 52ms/step
...
1/1 [==============================] - 0s 49ms/step
1/1 [==============================] - 0s 47ms/step
1/1 [==============================] - 0s 47ms/step
In [6]:
type(features_list)
Out[6]:
list
In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Convert features_list to a DataFrame for easier manipulation
df_features = pd.DataFrame(features_list)

# Histogram for the first feature
plt.figure(figsize=(10, 6))
sns.histplot(df_features[0], kde=True)
plt.title('Distribution of Feature 1')
plt.xlabel('Feature 1 Value')
plt.ylabel('Frequency')
plt.show()

# Pair plot for the first few features (this can be computationally intensive for large datasets)
sns.pairplot(df_features.iloc[:, :5])  # Adjust number of features as needed
plt.show()
In [8]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Convert features_list to a NumPy array
features_array = np.array(features_list)

# Apply t-SNE to the array
tsne = TSNE(n_components=2, random_state=42)
features_reduced = tsne.fit_transform(features_array)

# Plot the reduced features
plt.figure(figsize=(10, 8))
plt.scatter(features_reduced[:, 0], features_reduced[:, 1])
plt.title('t-SNE of Features')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()
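t-SNE on the raw 1000-dimensional vectors works, but a common refinement (sketched below, not part of the original run) is to reduce the features with PCA first, which speeds up t-SNE and tends to stabilise the embedding.

In [ ]:
from sklearn.decomposition import PCA

# Sketch: PCA to 50 dimensions before t-SNE.
# Assumes at least 50 images; lower n_components for smaller sets.
pca = PCA(n_components=50, random_state=42)
features_pca = pca.fit_transform(features_array)
features_reduced_pca = TSNE(n_components=2, random_state=42).fit_transform(features_pca)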
In [9]:
from sklearn.cluster import KMeans

# Example: Use KMeans to cluster the features
kmeans = KMeans(n_clusters=6, random_state=42)
clusters = kmeans.fit_predict(features_list)

# Visualize the clusters (if dimensionality reduction has been applied)
plt.scatter(features_reduced[:, 0], features_reduced[:, 1], c=clusters)
plt.title('Features Clustering with t-SNE')
plt.show()
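
The choice of six clusters above is fixed by hand; one hedged way to sanity-check it (sketch below, not part of the original run) is to scan a range of k values and compare silhouette scores.

In [ ]:
from sklearn.metrics import silhouette_score

# Sketch: silhouette score for a range of cluster counts; higher is better.
for k in range(2, 10):
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(features_array)
    print(k, round(silhouette_score(features_array, labels), 3))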

Plot Dendrogram

In [10]:
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
import matplotlib.pyplot as plt

# Perform hierarchical/agglomerative clustering
Z = linkage(features_list, method='ward')  # 'ward' minimizes the variance of clusters being merged.

# Plot dendrogram to help decide the number of clusters
plt.figure(figsize=(25, 10))
dendrogram(Z)
plt.show()
In [11]:
desired_clusters = 12
clusters = fcluster(Z, desired_clusters, criterion='maxclust')

print(f"Formed {max(clusters)} clusters.")
Formed 12 clusters.
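A quick look at the cluster sizes (a small sketch, using only variables already defined above) helps spot clusters that are unusually small or dominant before sorting.

In [ ]:
# Sketch: number of images per hierarchical cluster.
unique_ids, counts = np.unique(clusters, return_counts=True)
for cid, count in zip(unique_ids, counts):
    print(f"Cluster {cid}: {count} images")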
In [13]:
# Step 1: Pre-compute cluster centroids
cluster_centroids = {}
for i in range(1, max(clusters) + 1):
    cluster_features = np.array([features_list[idx] for idx, cluster_id in enumerate(clusters) if cluster_id == i])
    centroid = np.mean(cluster_features, axis=0)
    cluster_centroids[i] = centroid

# Step 2: Determine the order of clusters (example using overall centroid)
overall_centroid = np.mean(list(cluster_centroids.values()), axis=0)
cluster_distances = {i: np.linalg.norm(centroid - overall_centroid) for i, centroid in cluster_centroids.items()}
sorted_cluster_indices = sorted(cluster_distances, key=cluster_distances.get)  # Sort cluster IDs by their distance to the overall centroid
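Sorting clusters by their distance to the overall centroid is one option; an alternative sketch (not the method used here) chains clusters greedily by nearest centroid, so adjacent clusters in the final sequence are also visually close to each other.

In [ ]:
# Sketch of an alternative ordering: greedy nearest-neighbour chain over cluster centroids.
remaining = set(cluster_centroids.keys())
chain = [sorted_cluster_indices[0]]  # start from the cluster nearest the overall centroid
remaining.remove(chain[0])
while remaining:
    last_centroid = cluster_centroids[chain[-1]]
    nearest = min(remaining, key=lambda cid: np.linalg.norm(cluster_centroids[cid] - last_centroid))
    chain.append(nearest)
    remaining.remove(nearest)
print(chain)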
In [14]:
from scipy.spatial.distance import cdist

sorted_filenames = []
for cluster_id in sorted_cluster_indices:
    indices = [idx for idx, cluster_id_enum in enumerate(clusters) if cluster_id_enum == cluster_id]
    cluster_features = np.array([features_list[idx] for idx in indices])
    
    # Calculate distances to the cluster's centroid
    distances = np.linalg.norm(cluster_features - cluster_centroids[cluster_id], axis=1)
    
    # Sort indices within the cluster based on distance to centroid
    sorted_indices_in_cluster = np.argsort(distances)
    
    # Append sorted filenames for this cluster
    sorted_filenames += [filenames[indices[idx]] for idx in sorted_indices_in_cluster]
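A short check (sketch) that the re-ordered list still contains every image exactly once:

In [ ]:
# Sketch: the sorted sequence should be a permutation of the original filenames.
assert len(sorted_filenames) == len(filenames)
assert set(sorted_filenames) == set(filenames)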
In [ ]:
import matplotlib.pyplot as plt
import cv2
import numpy as np
import os

def show_clusters_overview(clusters, filenames, sorted_filenames, folder_path, images_per_cluster=5):
    # clusters is aligned with the original (unsorted) filenames list, so build an
    # explicit filename -> cluster mapping instead of indexing by position in sorted_filenames.
    filename_to_cluster = dict(zip(filenames, clusters))
    unique_clusters = np.unique(clusters)
    num_clusters = len(unique_clusters)

    plt.figure(figsize=(15, num_clusters * 3))  # Adjust the figure size as needed

    for row, cluster_id in enumerate(unique_clusters):
        # Take the first few filenames of this cluster in sorted (centroid-nearest-first) order
        cluster_filenames = [f for f in sorted_filenames if filename_to_cluster[f] == cluster_id][:images_per_cluster]

        for col, filename in enumerate(cluster_filenames):
            # Load and plot each image in the cluster
            img = cv2.imread(os.path.join(folder_path, filename))
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            # Keep each cluster on its own row, even when it has fewer than images_per_cluster images
            plt.subplot(num_clusters, images_per_cluster, row * images_per_cluster + col + 1)
            plt.imshow(img)
            plt.title(f"Cluster {cluster_id}")
            plt.axis('off')

    plt.tight_layout()
    plt.show()
In [ ]:
show_clusters_overview(clusters, filenames, sorted_filenames, folder_path, images_per_cluster=12)

Render to mov sequence

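ImageSequenceClip needs every frame to share the same dimensions; the sketch below (assuming cv2 is available, as imported earlier) checks this before rendering.

In [ ]:
# Sketch: collect the distinct image shapes; ideally this prints a single (h, w, c) tuple.
frame_shapes = {cv2.imread(os.path.join(folder_path, f)).shape for f in sorted_filenames}
print(frame_shapes)
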
In [ ]:
from moviepy.editor import ImageSequenceClip

# Parameters
fps = 25
image_duration = 1 / fps  # Duration each image should display

sorted_full_paths = [os.path.join(folder_path, filename) for filename in sorted_filenames]
clip = ImageSequenceClip(sorted_full_paths, fps=fps)
clip = clip.set_duration(image_duration * len(sorted_full_paths))

# Define the output filename
base_filename = "images/output/EasyDiffusion-Images-LRI758YQ_sorted_sequence"
extension = ".mov"
i = 1
current_filename = f"{base_filename}_{desired_clusters}_{i}{extension}"

while os.path.exists(current_filename):
    i += 1
    current_filename = f"{base_filename}_{desired_clusters}_{i}{extension}"

# Write the video file
clip.write_videofile(current_filename, codec="libx264")
In [ ]: