diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..66f8497f61c6d90d3faadafe0b306301dd6e0654
--- /dev/null
+++ b/main.py
@@ -0,0 +1,278 @@
+import os
+import re
+import warnings
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.cluster import KMeans, DBSCAN
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.manifold import TSNE
+from sklearn.preprocessing import normalize
+
+warnings.filterwarnings("ignore")
+
+# File path - update with your actual file path
+file_path = "/Users/awa01/Downloads/data/validation.csv"
+
+
+def preprocess_code(code):
+    """
+    Preprocess the source code to improve clustering quality.
+    """
+    if not isinstance(code, str):
+        return ""
+
+    # Convert to lowercase
+    code = code.lower()
+
+    # Remove comments (both // and /* ... */ style)
+    code = re.sub(r'//.*?(\n|$)', ' ', code)  # single-line comments
+    code = re.sub(r'/\*.*?\*/', ' ', code, flags=re.DOTALL)  # multi-line comments
+
+    # Replace string literals with a placeholder token
+    code = re.sub(r'".*?"', 'STRING_LITERAL', code)
+    code = re.sub(r"'.*?'", 'STRING_LITERAL', code)
+
+    # Replace standalone numbers with a placeholder token
+    code = re.sub(r'\b\d+\b', 'NUMBER', code)
+
+    # Collapse extra whitespace
+    code = re.sub(r'\s+', ' ', code).strip()
+
+    return code
+
+
+def cluster_contracts():
+    """
+    Load and cluster Ethereum contracts based on their source code.
+    """
+    print("Loading Ethereum contracts...")
+    try:
+        df = pd.read_csv(file_path)
+    except Exception as e:
+        print(f"Error reading CSV file: {e}")
+        return None
+
+    # Basic data information
+    print(f"Loaded {len(df)} contracts")
+
+    # Check that the source_code column exists
+    if 'source_code' not in df.columns:
+        print("Error: 'source_code' column not found in the CSV file.")
+        print(f"Available columns: {df.columns.tolist()}")
+        return None
+
+    # Remove rows with null source code
+    df = df.dropna(subset=['source_code'])
+    print(f"Contracts with non-null source code: {len(df)}")
+
+    # Preprocess source code
+    print("Preprocessing source code...")
+    df['processed_code'] = df['source_code'].apply(preprocess_code)
+
+    # Filter out empty processed code (copy to avoid chained-assignment warnings)
+    df = df[df['processed_code'].str.len() > 0].copy()
+    print(f"Contracts after preprocessing: {len(df)}")
+
+    # Extract features using TF-IDF
+    print("Extracting features using TF-IDF...")
+    tfidf_vectorizer = TfidfVectorizer(
+        max_features=5000,   # limit the vocabulary to reduce dimensionality
+        ngram_range=(1, 2),  # use unigrams and bigrams
+        stop_words='english',
+        min_df=5,            # ignore terms that appear in fewer than 5 documents
+        max_df=0.8           # ignore terms that appear in more than 80% of documents
+    )
+
+    # Create the TF-IDF matrix
+    tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_code'])
+    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
+
+    # Dimensionality reduction to improve clustering and speed up t-SNE
+    print("Performing dimensionality reduction...")
+    n_components = min(100, tfidf_matrix.shape[0] - 1, tfidf_matrix.shape[1] - 1)
+    svd = TruncatedSVD(n_components=n_components, random_state=42)
+    reduced_features = svd.fit_transform(tfidf_matrix)
+
+    # Normalize the reduced features so Euclidean distances approximate cosine
+    normalized_features = normalize(reduced_features)
+    print(f"Reduced features shape: {normalized_features.shape}")
+
+    # Determine the number of clusters using the elbow method
+    print("Determining optimal number of clusters...")
clusters...") + max_clusters = min(20, len(df) - 1) # Limit to 20 clusters or less + inertias = [] + + for k in range(2, max_clusters + 1): + kmeans = KMeans(n_clusters=k, random_state=42, n_init=10) + kmeans.fit(normalized_features) + inertias.append(kmeans.inertia_) + + # Plot elbow curve + plt.figure(figsize=(10, 6)) + plt.plot(range(2, max_clusters + 1), inertias, 'o-') + plt.xlabel('Number of Clusters') + plt.ylabel('Inertia') + plt.title('Elbow Method for Optimal Number of Clusters') + plt.grid(True) + plt.savefig('elbow_curve.png') + + # Find the elbow point (this is a simple heuristic) + differences = np.diff(inertias) + elbow_point = np.argmin(differences) + 2 # +2 because we started from 2 clusters + print(f"Estimated optimal number of clusters: {elbow_point}") + + # Perform K-means clustering with the optimal number of clusters + print(f"Performing K-means clustering with {elbow_point} clusters...") + kmeans = KMeans(n_clusters=20, random_state=42, n_init=10) + df['cluster'] = kmeans.fit_predict(normalized_features) + + # Alternative: Try DBSCAN clustering + print("Also trying DBSCAN clustering...") + dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine') + df['dbscan_cluster'] = dbscan.fit_predict(normalized_features) + + # Count contracts in each cluster + kmeans_cluster_counts = df['cluster'].value_counts().sort_index() + print("\nK-means Cluster Distribution:") + for cluster, count in kmeans_cluster_counts.items(): + print(f"Cluster {cluster}: {count} contracts ({count/len(df):.1%})") + + dbscan_cluster_counts = df['dbscan_cluster'].value_counts().sort_index() + print("\nDBSCAN Cluster Distribution:") + for cluster, count in dbscan_cluster_counts.items(): + print(f"Cluster {cluster}: {count} contracts ({count/len(df):.1%})") + + # Visualize clusters in 2D + print("Creating 2D visualization of clusters...") + tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(df) - 1)) + tsne_results = tsne.fit_transform(normalized_features) + + # Create a DataFrame for the visualization + vis_df = pd.DataFrame({ + 'x': tsne_results[:, 0], + 'y': tsne_results[:, 1], + 'cluster': df['cluster'] + }) + + # Plot the clusters + plt.figure(figsize=(12, 8)) + sns.scatterplot(data=vis_df, x='x', y='y', hue='cluster', palette='viridis', alpha=0.7) + plt.title('Contract Clusters Visualization') + plt.xlabel('t-SNE Dimension 1') + plt.ylabel('t-SNE Dimension 2') + plt.savefig('contract_clusters.png') + + # Analyze the content of each cluster to determine its characteristics + print("\nAnalyzing cluster characteristics...") + + # Get top terms for each cluster + cluster_terms = {} + + for cluster in range(elbow_point): + # Get indices of contracts in this cluster + cluster_indices = df[df['cluster'] == cluster].index + + if len(cluster_indices) == 0: + continue + + # Get the centroid of this cluster + centroid = kmeans.cluster_centers_[cluster] + + # Transform centroid back to TF-IDF space + centroid_tfidf = centroid.dot(svd.components_) + + # Get the top terms + top_term_indices = centroid_tfidf.argsort()[-20:][::-1] + feature_names = tfidf_vectorizer.get_feature_names_out() + top_terms = [feature_names[idx] for idx in top_term_indices] + + cluster_terms[cluster] = top_terms + + print(f"\nCluster {cluster} ({len(cluster_indices)} contracts):") + print(f"Top terms: {', '.join(top_terms[:10])}") + + # Sample a few contract snippets from this cluster + samples = df.loc[cluster_indices].sample(min(3, len(cluster_indices))) + for i, (_, sample) in enumerate(samples.iterrows()): + snippet = 
+            print(f"Sample {i+1}: {snippet}...")
+
+    # Save the results to CSV with verbose logging
+    print("\nSaving results...")
+    output_file = 'contract_clusters.csv'
+
+    try:
+        # Make sure we have data to save
+        if len(df) > 0:
+            # Save with explicit encoding
+            df[['source_code', 'cluster', 'dbscan_cluster']].to_csv(output_file, index=False, encoding='utf-8')
+            print(f"Successfully saved {len(df)} rows to {output_file}")
+
+            # Verify the file was created
+            if os.path.exists(output_file):
+                file_size = os.path.getsize(output_file) / (1024 * 1024)  # Size in MB
+                print(f"File created: {output_file} ({file_size:.2f} MB)")
+            else:
+                print(f"Warning: File {output_file} was not found after saving!")
+        else:
+            print("Warning: No data to save!")
+    except Exception as e:
+        print(f"Error saving CSV file: {e}")
+
+        # Fall back to the home directory in case the working directory is not writable
+        alt_output = os.path.join(os.path.expanduser('~'), 'contract_clusters.csv')
+        print(f"Trying alternative location: {alt_output}")
+        try:
+            df[['source_code', 'cluster', 'dbscan_cluster']].to_csv(alt_output, index=False, encoding='utf-8')
+            print(f"Successfully saved to alternative location: {alt_output}")
+        except Exception as e2:
+            print(f"Error saving to alternative location: {e2}")
+
+    # Also save interim results in case the full process has issues
+    try:
+        interim_file = 'contracts_interim_results.csv'
+        df[['cluster', 'dbscan_cluster']].to_csv(interim_file, index=False)
+        print(f"Saved interim results to {interim_file}")
+    except Exception as e:
+        print(f"Error saving interim results: {e}")
+
+    print("\nClustering complete!")
+    if os.path.exists('contract_clusters.csv'):
+        print("Results saved to 'contract_clusters.csv'")
+
+    if os.path.exists('elbow_curve.png') and os.path.exists('contract_clusters.png'):
+        print("Visualizations saved as 'elbow_curve.png' and 'contract_clusters.png'")
+
+    return df
+
+
+if __name__ == "__main__":
+    # Execute clustering and ensure results are saved properly
+    result_df = cluster_contracts()
+
+    # Additional save point as a backup
+    try:
+        if result_df is not None and len(result_df) > 0:
+            backup_file = 'contract_clusters_backup.csv'
+            result_df[['source_code', 'cluster', 'dbscan_cluster']].to_csv(backup_file, index=False)
+            print(f"Backup file created: {backup_file}")
+
+            # Print the working directory for troubleshooting
+            current_dir = os.getcwd()
+            print(f"Files should be saved in: {current_dir}")
+            print("List of files in current directory:")
+            for file in os.listdir(current_dir):
+                if file.endswith('.csv') or file.endswith('.png'):
+                    print(f" - {file}")
+    except Exception as e:
+        print(f"Error creating backup file: {e}")