diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..66f8497f61c6d90d3faadafe0b306301dd6e0654
--- /dev/null
+++ b/main.py
@@ -0,0 +1,278 @@
+import os
+import re
+import warnings
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.cluster import KMeans, DBSCAN
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.manifold import TSNE
+from sklearn.preprocessing import normalize
+
+warnings.filterwarnings("ignore")
+
+# File path - update with your actual file path
+file_path = "/Users/awa01/Downloads/data/validation.csv"
+
+
+def preprocess_code(code):
+    """
+    Preprocess the source code to improve clustering quality.
+    """
+    if not isinstance(code, str):
+        return ""
+
+    # Convert to lowercase
+    code = code.lower()
+
+    # Remove comments (both // and /* ... */ style)
+    code = re.sub(r'//.*?(\n|$)', ' ', code)  # single-line comments
+    code = re.sub(r'/\*.*?\*/', ' ', code, flags=re.DOTALL)  # multi-line comments
+
+    # Replace string literals with a placeholder token
+    code = re.sub(r'".*?"', 'STRING_LITERAL', code)
+    code = re.sub(r"'.*?'", 'STRING_LITERAL', code)
+
+    # Replace standalone numbers with a placeholder token
+    code = re.sub(r'\b\d+\b', 'NUMBER', code)
+
+    # Collapse extra whitespace
+    code = re.sub(r'\s+', ' ', code).strip()
+
+    return code
+
+
+def cluster_contracts():
+    """
+    Load and cluster Ethereum contracts based on their source code.
+    """
+    print("Loading Ethereum contracts...")
+    try:
+        df = pd.read_csv(file_path)
+    except Exception as e:
+        print(f"Error reading CSV file: {e}")
+        return None
+
+    # Basic data information
+    print(f"Loaded {len(df)} contracts")
+
+    # Check that the source_code column exists
+    if 'source_code' not in df.columns:
+        print("Error: 'source_code' column not found in the CSV file.")
+        print(f"Available columns: {df.columns.tolist()}")
+        return None
+
+    # Remove rows with null source code
+    df = df.dropna(subset=['source_code'])
+    print(f"Contracts with non-null source code: {len(df)}")
+
+    # Preprocess source code
+    print("Preprocessing source code...")
+    df['processed_code'] = df['source_code'].apply(preprocess_code)
+
+    # Filter out empty processed code (copy to avoid chained-assignment warnings)
+    df = df[df['processed_code'].str.len() > 0].copy()
+    print(f"Contracts after preprocessing: {len(df)}")
+
+    # Extract features using TF-IDF
+    print("Extracting features using TF-IDF...")
+    tfidf_vectorizer = TfidfVectorizer(
+        max_features=5000,   # limit the vocabulary to reduce dimensionality
+        ngram_range=(1, 2),  # use unigrams and bigrams
+        stop_words='english',
+        min_df=5,            # ignore terms that appear in fewer than 5 documents
+        max_df=0.8           # ignore terms that appear in more than 80% of documents
+    )
+
+    # Create the TF-IDF matrix
+    tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_code'])
+    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
+
+    # Dimensionality reduction to improve clustering and speed up t-SNE
+    print("Performing dimensionality reduction...")
+    n_components = min(100, tfidf_matrix.shape[0] - 1, tfidf_matrix.shape[1] - 1)
+    svd = TruncatedSVD(n_components=n_components, random_state=42)
+    reduced_features = svd.fit_transform(tfidf_matrix)
+
+    # Normalize the reduced features so Euclidean distances approximate cosine
+    normalized_features = normalize(reduced_features)
+    print(f"Reduced features shape: {normalized_features.shape}")
+
+    # Determine the number of clusters using the elbow method
+    print("Determining optimal number of clusters...")
clusters...") + max_clusters = min(20, len(df) - 1) # Limit to 20 clusters or less + inertias = [] + + for k in range(2, max_clusters + 1): + kmeans = KMeans(n_clusters=k, random_state=42, n_init=10) + kmeans.fit(normalized_features) + inertias.append(kmeans.inertia_) + + # Plot elbow curve + plt.figure(figsize=(10, 6)) + plt.plot(range(2, max_clusters + 1), inertias, 'o-') + plt.xlabel('Number of Clusters') + plt.ylabel('Inertia') + plt.title('Elbow Method for Optimal Number of Clusters') + plt.grid(True) + plt.savefig('elbow_curve.png') + + # Find the elbow point (this is a simple heuristic) + differences = np.diff(inertias) + elbow_point = np.argmin(differences) + 2 # +2 because we started from 2 clusters + print(f"Estimated optimal number of clusters: {elbow_point}") + + # Perform K-means clustering with the optimal number of clusters + print(f"Performing K-means clustering with {elbow_point} clusters...") + kmeans = KMeans(n_clusters=20, random_state=42, n_init=10) + df['cluster'] = kmeans.fit_predict(normalized_features) + + # Alternative: Try DBSCAN clustering + print("Also trying DBSCAN clustering...") + dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine') + df['dbscan_cluster'] = dbscan.fit_predict(normalized_features) + + # Count contracts in each cluster + kmeans_cluster_counts = df['cluster'].value_counts().sort_index() + print("\nK-means Cluster Distribution:") + for cluster, count in kmeans_cluster_counts.items(): + print(f"Cluster {cluster}: {count} contracts ({count/len(df):.1%})") + + dbscan_cluster_counts = df['dbscan_cluster'].value_counts().sort_index() + print("\nDBSCAN Cluster Distribution:") + for cluster, count in dbscan_cluster_counts.items(): + print(f"Cluster {cluster}: {count} contracts ({count/len(df):.1%})") + + # Visualize clusters in 2D + print("Creating 2D visualization of clusters...") + tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(df) - 1)) + tsne_results = tsne.fit_transform(normalized_features) + + # Create a DataFrame for the visualization + vis_df = pd.DataFrame({ + 'x': tsne_results[:, 0], + 'y': tsne_results[:, 1], + 'cluster': df['cluster'] + }) + + # Plot the clusters + plt.figure(figsize=(12, 8)) + sns.scatterplot(data=vis_df, x='x', y='y', hue='cluster', palette='viridis', alpha=0.7) + plt.title('Contract Clusters Visualization') + plt.xlabel('t-SNE Dimension 1') + plt.ylabel('t-SNE Dimension 2') + plt.savefig('contract_clusters.png') + + # Analyze the content of each cluster to determine its characteristics + print("\nAnalyzing cluster characteristics...") + + # Get top terms for each cluster + cluster_terms = {} + + for cluster in range(elbow_point): + # Get indices of contracts in this cluster + cluster_indices = df[df['cluster'] == cluster].index + + if len(cluster_indices) == 0: + continue + + # Get the centroid of this cluster + centroid = kmeans.cluster_centers_[cluster] + + # Transform centroid back to TF-IDF space + centroid_tfidf = centroid.dot(svd.components_) + + # Get the top terms + top_term_indices = centroid_tfidf.argsort()[-20:][::-1] + feature_names = tfidf_vectorizer.get_feature_names_out() + top_terms = [feature_names[idx] for idx in top_term_indices] + + cluster_terms[cluster] = top_terms + + print(f"\nCluster {cluster} ({len(cluster_indices)} contracts):") + print(f"Top terms: {', '.join(top_terms[:10])}") + + # Sample a few contract snippets from this cluster + samples = df.loc[cluster_indices].sample(min(3, len(cluster_indices))) + for i, (_, sample) in enumerate(samples.iterrows()): + snippet = 
+            print(f"Sample {i+1}: {snippet}...")
+
+    # Save the results to CSV with verbose logging
+    print("\nSaving results...")
+    output_file = 'contract_clusters.csv'
+
+    try:
+        # Make sure we have data to save
+        if len(df) > 0:
+            # Save with explicit encoding
+            df[['source_code', 'cluster', 'dbscan_cluster']].to_csv(output_file, index=False, encoding='utf-8')
+            print(f"Successfully saved {len(df)} rows to {output_file}")
+
+            # Verify the file was created
+            if os.path.exists(output_file):
+                file_size = os.path.getsize(output_file) / (1024 * 1024)  # Size in MB
+                print(f"File created: {output_file} ({file_size:.2f} MB)")
+            else:
+                print(f"Warning: File {output_file} was not found after saving!")
+        else:
+            print("Warning: No data to save!")
+    except Exception as e:
+        print(f"Error saving CSV file: {e}")
+
+        # Fall back to the home directory in case the working directory is not writable
+        alt_output = os.path.join(os.path.expanduser('~'), 'contract_clusters.csv')
+        print(f"Trying alternative location: {alt_output}")
+        try:
+            df[['source_code', 'cluster', 'dbscan_cluster']].to_csv(alt_output, index=False, encoding='utf-8')
+            print(f"Successfully saved to alternative location: {alt_output}")
+        except Exception as e2:
+            print(f"Error saving to alternative location: {e2}")
+
+    # Also save interim results in case the full process has issues
+    try:
+        interim_file = 'contracts_interim_results.csv'
+        df[['cluster', 'dbscan_cluster']].to_csv(interim_file, index=False)
+        print(f"Saved interim results to {interim_file}")
+    except Exception as e:
+        print(f"Error saving interim results: {e}")
+
+    print("\nClustering complete!")
+    if os.path.exists('contract_clusters.csv'):
+        print("Results saved to 'contract_clusters.csv'")
+
+    if os.path.exists('elbow_curve.png') and os.path.exists('contract_clusters.png'):
+        print("Visualizations saved as 'elbow_curve.png' and 'contract_clusters.png'")
+
+    return df
+
+
+if __name__ == "__main__":
+    # Execute clustering and ensure results are saved properly
+    result_df = cluster_contracts()
+
+    # Additional save point as a backup
+    try:
+        if result_df is not None and len(result_df) > 0:
+            backup_file = 'contract_clusters_backup.csv'
+            result_df[['source_code', 'cluster', 'dbscan_cluster']].to_csv(backup_file, index=False)
+            print(f"Backup file created: {backup_file}")
+
+            # Print the working directory for troubleshooting
+            current_dir = os.getcwd()
+            print(f"Files should be saved in: {current_dir}")
+            print("List of files in current directory:")
+            for file in os.listdir(current_dir):
+                if file.endswith('.csv') or file.endswith('.png'):
+                    print(f" - {file}")
+    except Exception as e:
+        print(f"Error creating backup file: {e}")