diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index e500cf24..b264ff4a 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,7 +1,7 @@ { - "name": "TM Graph Devcontainer", + "name": "TM Graph Recomm", "dockerComposeFile": "docker-compose.yml", - "service": "tm-graph-development", + "service": "tm-graph-recomm", "workspaceFolder": "/app", "forwardPorts": [], "postCreateCommand": "echo 'Devcontainer is ready'", diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 183c3acd..46271d06 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -1,5 +1,5 @@ services: - tm-graph-development: + tm-graph-recomm: build: context: ../ dockerfile: .devcontainer/Dockerfile @@ -9,4 +9,5 @@ services: devices: - driver: nvidia capabilities: [gpu] - count: 1 # Assign number of GPUs or use 'all' to assign all available GPUs \ No newline at end of file + # count: 1 # Assign number of GPUs or use 'all' to assign all available GPUs + device_ids: ["6"] \ No newline at end of file diff --git a/.gitignore b/.gitignore index 9a53b815..9f5bc781 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +build/ +GraphTsetlinMachine.egg-info/ +/dist/ + .envrc # Byte-compiled / optimized / DLL files diff --git a/examples/MNISTConvolutionDemo.py b/examples/MNISTConvolutionDemo.py index bd4ff9bc..719bf634 100644 --- a/examples/MNISTConvolutionDemo.py +++ b/examples/MNISTConvolutionDemo.py @@ -63,18 +63,13 @@ def default_args(**kwargs): double_hashing = args.double_hashing, one_hot_encoding = args.one_hot_encoding ) - for graph_id in range(X_train.shape[0]): graphs_train.set_number_of_graph_nodes(graph_id, number_of_nodes) - graphs_train.prepare_node_configuration() - for graph_id in range(X_train.shape[0]): for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): graphs_train.add_graph_node(graph_id, node_id, 0) - graphs_train.prepare_edge_configuration() - for graph_id in range(X_train.shape[0]): if graph_id % 1000 == 0: print(graph_id, X_train.shape[0]) @@ -90,23 +85,17 @@ def default_args(**kwargs): graphs_train.add_graph_node_property(graph_id, node_id, "C:%d" % (q)) graphs_train.add_graph_node_property(graph_id, node_id, "R:%d" % (r)) - graphs_train.encode() - print("Training data produced") graphs_test = Graphs(X_test.shape[0], init_with=graphs_train) for graph_id in range(X_test.shape[0]): graphs_test.set_number_of_graph_nodes(graph_id, number_of_nodes) - graphs_test.prepare_node_configuration() - for graph_id in range(X_test.shape[0]): for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): graphs_test.add_graph_node(graph_id, node_id, 0) - graphs_test.prepare_edge_configuration() - for graph_id in range(X_test.shape[0]): if graph_id % 1000 == 0: print(graph_id, X_test.shape[0]) @@ -122,9 +111,7 @@ def default_args(**kwargs): graphs_test.add_graph_node_property(graph_id, node_id, "C:%d" % (q)) graphs_test.add_graph_node_property(graph_id, node_id, "R:%d" % (r)) - graphs_test.encode() - print("Testing data produced") tm = MultiClassGraphTsetlinMachine( diff --git a/examples/MNISTVanillaDemo.py b/examples/MNISTVanillaDemo.py index a3a9bebb..80cbb056 100644 --- a/examples/MNISTVanillaDemo.py +++ b/examples/MNISTVanillaDemo.py @@ -4,7 +4,6 @@ from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine from time import time import argparse -from skimage.util import view_as_windows from keras.datasets import mnist from numba import jit @@ -55,51 +54,36 @@ def 
default_args(**kwargs): double_hashing = args.double_hashing, one_hot_encoding = args.one_hot_encoding ) - for graph_id in range(X_train.shape[0]): graphs_train.set_number_of_graph_nodes(graph_id, number_of_nodes) - graphs_train.prepare_node_configuration() - for graph_id in range(X_train.shape[0]): number_of_outgoing_edges = 0 graphs_train.add_graph_node(graph_id, 'Image Node', number_of_outgoing_edges) - graphs_train.prepare_edge_configuration() - for graph_id in range(X_train.shape[0]): - if graph_id % 1000 == 0: - print(graph_id, X_train.shape[0]) - + # if graph_id % 1000 == 0: + # print(graph_id, X_train.shape[0]) for k in X_train[graph_id].nonzero()[0]: graphs_train.add_graph_node_property(graph_id, 'Image Node', "W%d,%d" % (k // 28, k % 28)) - graphs_train.encode() - print("Training data produced") graphs_test = Graphs(X_test.shape[0], init_with=graphs_train) - for graph_id in range(X_test.shape[0]): graphs_test.set_number_of_graph_nodes(graph_id, number_of_nodes) - graphs_test.prepare_node_configuration() - for graph_id in range(X_test.shape[0]): number_of_outgoing_edges = 0 graphs_test.add_graph_node(graph_id, 'Image Node', number_of_outgoing_edges) - graphs_test.prepare_edge_configuration() - for graph_id in range(X_test.shape[0]): if graph_id % 1000 == 0: print(graph_id, X_test.shape[0]) for k in X_test[graph_id].nonzero()[0]: graphs_test.add_graph_node_property(graph_id, 'Image Node', "W%d,%d" % (k // 28, k % 28)) - graphs_test.encode() - print("Testing data produced") tm = MultiClassGraphTsetlinMachine( @@ -128,16 +112,16 @@ def default_args(**kwargs): print("%d %.2f %.2f %.2f %.2f" % (i, result_train, result_test, stop_training-start_training, stop_testing-start_testing)) -weights = tm.get_state()[1].reshape(2, -1) -for i in range(tm.number_of_clauses): - print("Clause #%d Weights:(%d %d)" % (i, weights[0,i], weights[1,i]), end=' ') - l = [] - for k in range(args.hypervector_size * 2): - if tm.ta_action(0, i, k): - if k < args.hypervector_size: - l.append("x%d" % (k)) - else: - l.append("NOT x%d" % (k - args.hypervector_size)) - print(" AND ".join(l)) - -print(graphs_train.hypervectors) \ No newline at end of file +# weights = tm.get_state()[1].reshape(2, -1) +# for i in range(tm.number_of_clauses): +# print("Clause #%d Weights:(%d %d)" % (i, weights[0,i], weights[1,i]), end=' ') +# l = [] +# for k in range(args.hypervector_size * 2): +# if tm.ta_action(0, i, k): +# if k < args.hypervector_size: +# l.append("x%d" % (k)) +# else: +# l.append("NOT x%d" % (k - args.hypervector_size)) +# print(" AND ".join(l)) + +# print(graphs_train.hypervectors) \ No newline at end of file diff --git a/examples/NoisyXORDemo.py b/examples/NoisyXORDemo.py index 93a6b453..5a22dbcc 100644 --- a/examples/NoisyXORDemo.py +++ b/examples/NoisyXORDemo.py @@ -35,7 +35,6 @@ def default_args(**kwargs): print("Creating training data") # Create train data - graphs_train = Graphs( args.number_of_examples, symbols=['A', 'B'], @@ -48,21 +47,16 @@ def default_args(**kwargs): for graph_id in range(args.number_of_examples): graphs_train.set_number_of_graph_nodes(graph_id, 2) - graphs_train.prepare_node_configuration() - for graph_id in range(args.number_of_examples): number_of_outgoing_edges = 1 graphs_train.add_graph_node(graph_id, 'Node 1', number_of_outgoing_edges) graphs_train.add_graph_node(graph_id, 'Node 2', number_of_outgoing_edges) - -graphs_train.prepare_edge_configuration() - +graphs_train.prepare_edge_configuration() for graph_id in range(args.number_of_examples): edge_type = "Plain"
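# Note on the two calls below: edges are directed, so each XOR pair is linked in both directions ('Node 1' -> 'Node 2' and back), consuming the single outgoing edge declared per node above.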
graphs_train.add_graph_node_edge(graph_id, 'Node 1', 'Node 2', edge_type) graphs_train.add_graph_node_edge(graph_id, 'Node 2', 'Node 1', edge_type) - Y_train = np.empty(args.number_of_examples, dtype=np.uint32) for graph_id in range(args.number_of_examples): x1 = random.choice(['A', 'B']) @@ -78,32 +72,23 @@ def default_args(**kwargs): if np.random.rand() <= args.noise: Y_train[graph_id] = 1 - Y_train[graph_id] - graphs_train.encode() -# Create test data - +# Create test data print("Creating testing data") - graphs_test = Graphs(args.number_of_examples, init_with=graphs_train) - for graph_id in range(args.number_of_examples): graphs_test.set_number_of_graph_nodes(graph_id, 2) - graphs_test.prepare_node_configuration() - for graph_id in range(args.number_of_examples): number_of_outgoing_edges = 1 graphs_test.add_graph_node(graph_id, 'Node 1', number_of_outgoing_edges) graphs_test.add_graph_node(graph_id, 'Node 2', number_of_outgoing_edges) - graphs_test.prepare_edge_configuration() - for graph_id in range(args.number_of_examples): edge_type = "Plain" graphs_test.add_graph_node_edge(graph_id, 'Node 1', 'Node 2', edge_type) graphs_test.add_graph_node_edge(graph_id, 'Node 2', 'Node 1', edge_type) - Y_test = np.empty(args.number_of_examples, dtype=np.uint32) for graph_id in range(args.number_of_examples): x1 = random.choice(['A', 'B']) @@ -116,7 +101,6 @@ def default_args(**kwargs): Y_test[graph_id] = 0 else: Y_test[graph_id] = 1 - graphs_test.encode() tm = MultiClassGraphTsetlinMachine( diff --git a/examples/NoisyXORMNISTDemo.py b/examples/NoisyXORMNISTDemo.py index ff1b3151..5da47877 100644 --- a/examples/NoisyXORMNISTDemo.py +++ b/examples/NoisyXORMNISTDemo.py @@ -54,24 +54,18 @@ def default_args(**kwargs): hypervector_size=args.hypervector_size, hypervector_bits=args.hypervector_bits, ) - for graph_id in range(args.number_of_examples): graphs_train.set_number_of_graph_nodes(graph_id, 2) - graphs_train.prepare_node_configuration() - for graph_id in range(args.number_of_examples): number_of_outgoing_edges = 1 graphs_train.add_graph_node(graph_id, 'Node 1', number_of_outgoing_edges) graphs_train.add_graph_node(graph_id, 'Node 2', number_of_outgoing_edges) - graphs_train.prepare_edge_configuration() - for graph_id in range(args.number_of_examples): edge_type = "Plain" graphs_train.add_graph_node_edge(graph_id, 'Node 1', 'Node 2', edge_type) graphs_train.add_graph_node_edge(graph_id, 'Node 2', 'Node 1', edge_type) - Y_train = np.empty(args.number_of_examples, dtype=np.uint32) for graph_id in range(args.number_of_examples): x1 = random.choice([0, 1]) @@ -91,32 +85,23 @@ def default_args(**kwargs): if np.random.rand() <= args.noise: Y_train[graph_id] = 1 - Y_train[graph_id] - graphs_train.encode() # Create test data - print("Creating testing data") - graphs_test = Graphs(args.number_of_examples, init_with=graphs_train) - for graph_id in range(args.number_of_examples): graphs_test.set_number_of_graph_nodes(graph_id, 2) - graphs_test.prepare_node_configuration() - for graph_id in range(args.number_of_examples): number_of_outgoing_edges = 1 graphs_test.add_graph_node(graph_id, 'Node 1', number_of_outgoing_edges) graphs_test.add_graph_node(graph_id, 'Node 2', number_of_outgoing_edges) - graphs_test.prepare_edge_configuration() - for graph_id in range(args.number_of_examples): edge_type = "Plain" graphs_test.add_graph_node_edge(graph_id, 'Node 1', 'Node 2', edge_type) graphs_test.add_graph_node_edge(graph_id, 'Node 2', 'Node 1', edge_type) - Y_test = np.empty(args.number_of_examples, dtype=np.uint32) 
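# Test-label construction (loop below): each graph again draws random bits and the target is their XOR; unlike the training loop above, no noise flip appears in this hunk for the test labels.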
for graph_id in range(args.number_of_examples): x1 = random.choice([0, 1]) @@ -133,7 +118,6 @@ def default_args(**kwargs): Y_test[graph_id] = 0 else: Y_test[graph_id] = 1 - graphs_test.encode() tm = MultiClassGraphTsetlinMachine( diff --git a/examples/SequenceClassificationDemo.py b/examples/SequenceClassificationDemo.py index 7a2362cb..c5b13214 100644 --- a/examples/SequenceClassificationDemo.py +++ b/examples/SequenceClassificationDemo.py @@ -35,7 +35,6 @@ def default_args(**kwargs): print("Creating training data") # Create train data - graphs_train = Graphs( args.number_of_examples, symbols=['A'], @@ -43,19 +42,14 @@ def default_args(**kwargs): hypervector_bits=args.hypervector_bits, double_hashing = args.double_hashing ) - for graph_id in range(args.number_of_examples): graphs_train.set_number_of_graph_nodes(graph_id, np.random.randint(args.number_of_classes, args.max_sequence_length+1)) - graphs_train.prepare_node_configuration() - for graph_id in range(args.number_of_examples): for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): number_of_edges = 2 if node_id > 0 and node_id < graphs_train.number_of_graph_nodes[graph_id]-1 else 1 graphs_train.add_graph_node(graph_id, node_id, number_of_edges) - graphs_train.prepare_edge_configuration() - Y_train = np.empty(args.number_of_examples, dtype=np.uint32) for graph_id in range(args.number_of_examples): for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): @@ -76,26 +70,19 @@ def default_args(**kwargs): if np.random.rand() <= args.noise: Y_train[graph_id] = np.random.choice(np.setdiff1d(np.arange(args.number_of_classes), [Y_train[graph_id]])) - graphs_train.encode() # Create test data - print("Creating testing data") - graphs_test = Graphs(args.number_of_examples, init_with=graphs_train) for graph_id in range(args.number_of_examples): graphs_test.set_number_of_graph_nodes(graph_id, np.random.randint(args.number_of_classes, args.max_sequence_length+1)) - graphs_test.prepare_node_configuration() - for graph_id in range(args.number_of_examples): for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): number_of_edges = 2 if node_id > 0 and node_id < graphs_test.number_of_graph_nodes[graph_id]-1 else 1 graphs_test.add_graph_node(graph_id, node_id, number_of_edges) - graphs_test.prepare_edge_configuration() - Y_test = np.empty(args.number_of_examples, dtype=np.uint32) for graph_id in range(args.number_of_examples): for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): @@ -113,7 +100,6 @@ def default_args(**kwargs): node_id = np.random.randint(Y_test[graph_id], graphs_test.number_of_graph_nodes[graph_id]) for node_pos in range(Y_test[graph_id] + 1): graphs_test.add_graph_node_property(graph_id, node_id - node_pos, 'A') - graphs_test.encode() tm = MultiClassGraphTsetlinMachine( diff --git a/examples/recomm_system/README.md b/examples/recomm_system/README.md new file mode 100644 index 00000000..c03a4deb --- /dev/null +++ b/examples/recomm_system/README.md @@ -0,0 +1,17 @@ +# Recommender System Experiments + +**How to run:** +```sh +cd examples/recomm_system/ +python3 main.py +``` + +**Files:** +- `main.py` — Runs all experiments, calls each model script for various noise ratios, saves results to `experiment_results.csv`. +- `graph_nn.py` — Graph Neural Network (GCN) experiment. +- `graph_tm.py` — Graph Tsetlin Machine experiment. +- `tm_classifier.py` — Tsetlin Machine Classifier experiment. +- `prepare_dataset.py` — Dataset download, noise injection, preprocessing. 
+- `experiment_results.csv` — Results log (auto-generated). +- `test.ipynb` — Summarizes results, generates LaTeX tables. + diff --git a/examples/recomm_system/__pycache__/prepare_dataset.cpython-310.pyc b/examples/recomm_system/__pycache__/prepare_dataset.cpython-310.pyc new file mode 100644 index 00000000..8a2cd183 Binary files /dev/null and b/examples/recomm_system/__pycache__/prepare_dataset.cpython-310.pyc differ diff --git a/examples/recomm_system/experiment_results.csv b/examples/recomm_system/experiment_results.csv new file mode 100644 index 00000000..45da5c4e --- /dev/null +++ b/examples/recomm_system/experiment_results.csv @@ -0,0 +1,181 @@ +Exp_id,Algorithm,Noise_Ratio,T,s,Max_Included_Literals,Epochs,Platform,Total_Time,Accuracy +20250409090514,Graph NN,0.005,0,0,0,2000,CPU,47.380565881729126,84.23497080802917 +20250409090514,GraphTM,0.005,10000,10.0,23,10,CUDA,110.12741780281067,98.63387978142076 +20250409090514,TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,1190.9095215797424,77.11748633879782 +20250409090514,Graph NN,0.01,0,0,0,2000,CPU,49.00558853149414,92.65027046203613 +20250409090514,GraphTM,0.01,10000,10.0,23,10,CUDA,113.65191793441772,98.44262295081967 +20250409090514,TMClassifier,0.01,10000,10.0,32,10,CPU_sparse,1020.6083555221558,74.86338797814209 +20250409090514,Graph NN,0.02,0,0,0,2000,CPU,44.6860625743866,77.13114619255066 +20250409090514,GraphTM,0.02,10000,10.0,23,10,CUDA,121.2872724533081,97.78688524590164 +20250409090514,TMClassifier,0.02,10000,10.0,32,10,CPU_sparse,1246.0178999900818,72.40437158469946 +20250409090514,Graph NN,0.05,0,0,0,2000,CPU,82.58793544769287,88.46994638442993 +20250409090514,GraphTM,0.05,10000,10.0,23,10,CUDA,137.15939092636108,94.39890710382514 +20250409090514,TMClassifier,0.05,10000,10.0,32,10,CPU_sparse,1317.742176771164,63.25136612021858 +20250409090514,Graph NN,0.1,0,0,0,2000,CPU,54.852065563201904,76.4207661151886 +20250409090514,GraphTM,0.1,10000,10.0,23,10,CUDA,151.09674072265625,89.89071038251366 +20250409090514,TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,1123.5956239700317,49.59016393442623 +20250409090514,Graph NN,0.2,0,0,0,2000,CPU,51.210848808288574,68.93442869186401 +20250409090514,GraphTM,0.2,10000,10.0,23,10,CUDA,170.72992277145386,78.5792349726776 +20250409090514,TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,1148.6567842960358,20.116120218579233 +20250409090514,Graph NN,0.005,0,0,0,2000,CPU,48.660605907440186,86.63934469223022 +20250409090514,GraphTM,0.005,10000,10.0,23,10,CUDA,110.17098808288574,98.82513661202185 +20250409090514,TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,1061.7185904979706,76.63934426229508 +20250409090514,Graph NN,0.01,0,0,0,2000,CPU,49.778627157211304,95.76502442359924 +20250409090514,GraphTM,0.01,10000,10.0,23,10,CUDA,113.88378477096558,98.4153005464481 +20250409090514,TMClassifier,0.01,10000,10.0,32,10,CPU_sparse,1058.3029556274414,74.93169398907104 +20250409090514,Graph NN,0.02,0,0,0,2000,CPU,39.869826555252075,76.5573799610138 +20250409090514,GraphTM,0.02,10000,10.0,23,10,CUDA,120.86488842964172,97.6775956284153 +20250409090514,TMClassifier,0.02,10000,10.0,32,10,CPU_sparse,1157.85533452034,72.40437158469946 +20250409090514,Graph NN,0.05,0,0,0,2000,CPU,39.27051615715027,80.21857738494873 +20250409090514,GraphTM,0.05,10000,10.0,23,10,CUDA,137.07859206199646,94.42622950819673 +20250409090514,TMClassifier,0.05,10000,10.0,32,10,CPU_sparse,1060.4789934158325,64.1051912568306 +20250409090514,Graph NN,0.1,0,0,0,2000,CPU,41.18854546546936,78.032785654068 
+20250409090514,GraphTM,0.1,10000,10.0,23,10,CUDA,150.01649594306946,89.86338797814207 +20250409090514,TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,1074.8029758930206,49.21448087431694 +20250409090514,Graph NN,0.2,0,0,0,2000,CPU,42.942272901535034,68.22404265403748 +20250409090514,GraphTM,0.2,10000,10.0,23,10,CUDA,170.39786314964294,78.0327868852459 +20250409090514,TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,1051.4041996002197,20.081967213114755 +20250409090514,Graph NN,0.005,0,0,0,2000,CPU,48.943641662597656,80.43715953826904 +20250409090514,GraphTM,0.005,10000,10.0,23,10,CUDA,111.18853044509888,98.82513661202185 +20250409090514,TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,1000.6668944358826,76.74180327868852 +20250409090514,Graph NN,0.01,0,0,0,2000,CPU,34.4648540019989,84.59016680717468 +20250409090514,GraphTM,0.01,10000,10.0,23,10,CUDA,113.77461814880371,98.27868852459017 +20250409090514,TMClassifier,0.01,10000,10.0,32,10,CPU_sparse,1045.2479929924011,74.93169398907104 +20250409090514,Graph NN,0.02,0,0,0,2000,CPU,40.32768535614014,77.40437388420105 +20250409090514,GraphTM,0.02,10000,10.0,23,10,CUDA,120.75347566604614,97.8415300546448 +20250409090514,TMClassifier,0.02,10000,10.0,32,10,CPU_sparse,1042.6038060188293,72.1311475409836 +20250409090514,Graph NN,0.05,0,0,0,2000,CPU,49.051427602767944,76.85792446136475 +20250409090514,GraphTM,0.05,10000,10.0,23,10,CUDA,135.81657576560974,94.89071038251366 +20250409090514,TMClassifier,0.05,10000,10.0,32,10,CPU_sparse,1049.5465006828308,63.69535519125683 +20250409090514,Graph NN,0.1,0,0,0,2000,CPU,50.19066071510315,74.07103776931763 +20250409090514,GraphTM,0.1,10000,10.0,23,10,CUDA,150.23873829841614,89.69945355191257 +20250409090514,TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,1161.7163217067719,48.80464480874317 +20250409090514,Graph NN,0.2,0,0,0,2000,CPU,42.93249225616455,63.06011080741882 +20250409090514,GraphTM,0.2,10000,10.0,23,10,CUDA,169.8643877506256,79.20765027322403 +20250409090514,TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,968.4304020404816,20.116120218579233 +20250409090514,Graph NN,0.005,0,0,0,2000,CPU,46.011924266815186,80.24590015411377 +20250409090514,GraphTM,0.005,10000,10.0,23,10,CUDA,109.72403120994568,98.66120218579235 +20250409090514,TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,1174.494342327118,76.74180327868852 +20250409090514,Graph NN,0.01,0,0,0,2000,CPU,41.743159532547,80.02732396125793 +20250409090514,GraphTM,0.01,10000,10.0,23,10,CUDA,114.41021490097046,98.4153005464481 +20250409090514,TMClassifier,0.01,10000,10.0,32,10,CPU_sparse,1171.6064977645874,74.93169398907104 +20250409090514,Graph NN,0.02,0,0,0,2000,CPU,44.349541664123535,87.45901584625244 +20250409090514,GraphTM,0.02,10000,10.0,23,10,CUDA,121.4791738986969,97.45901639344262 +20250409090514,TMClassifier,0.02,10000,10.0,32,10,CPU_sparse,952.0120975971222,71.65300546448088 +20250409090514,Graph NN,0.05,0,0,0,2000,CPU,48.69317936897278,75.92896223068237 +20250409090514,GraphTM,0.05,10000,10.0,23,10,CUDA,136.3904469013214,94.4535519125683 +20250409090514,TMClassifier,0.05,10000,10.0,32,10,CPU_sparse,969.868058681488,64.00273224043715 +20250409090514,Graph NN,0.1,0,0,0,2000,CPU,44.044572591781616,70.8743155002594 +20250409090514,GraphTM,0.1,10000,10.0,23,10,CUDA,149.6289074420929,89.8360655737705 +20250409090514,TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,953.6086061000824,50.10245901639344 +20250409090514,Graph NN,0.2,0,0,0,2000,CPU,44.549598932266235,61.284154653549194 +20250409090514,GraphTM,0.2,10000,10.0,23,10,CUDA,170.53832936286926,79.53551912568307 
+20250409090514,TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,972.7086639404297,20.116120218579233 +20250409090514,Graph NN,0.005,0,0,0,2000,CPU,47.114877223968506,81.69398903846741 +20250409090514,GraphTM,0.005,10000,10.0,23,10,CUDA,109.53987145423889,98.68852459016394 +20250409090514,TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,957.2526223659515,76.63934426229508 +20250409090514,Graph NN,0.01,0,0,0,2000,CPU,37.89606070518494,85.65573692321777 +20250409090514,GraphTM,0.01,10000,10.0,23,10,CUDA,114.25655388832092,98.30601092896175 +20250409090514,TMClassifier,0.01,10000,10.0,32,10,CPU_sparse,1173.4506571292877,74.93169398907104 +20250409090514,Graph NN,0.02,0,0,0,2000,CPU,47.68080997467041,83.36065411567688 +20250409090514,GraphTM,0.02,10000,10.0,23,10,CUDA,120.15364933013916,97.8688524590164 +20250409090514,TMClassifier,0.02,10000,10.0,32,10,CPU_sparse,1153.5412156581879,72.1311475409836 +20250409090514,Graph NN,0.05,0,0,0,2000,CPU,41.10796904563904,83.41529965400696 +20250409090514,GraphTM,0.05,10000,10.0,23,10,CUDA,136.6818916797638,95.0 +20250409090514,TMClassifier,0.05,10000,10.0,32,10,CPU_sparse,967.7902429103851,63.25136612021858 +20250409090514,Graph NN,0.1,0,0,0,2000,CPU,36.63528251647949,82.81420469284058 +20250409090514,GraphTM,0.1,10000,10.0,23,10,CUDA,150.54849863052368,89.31693989071037 +20250409090514,TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,965.3704278469086,49.62431693989071 +20250409090514,Graph NN,0.2,0,0,0,2000,CPU,40.28898596763611,64.61748480796814 +20250409090514,GraphTM,0.2,10000,10.0,23,10,CUDA,169.49659419059753,79.97267759562841 +20250409090514,TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,1158.38462972641,20.21857923497268 +20250409090514,Graph NN,0.005,0,0,0,2000,CPU,43.29892086982727,77.95081734657288 +20250409090514,GraphTM,0.005,10000,10.0,23,10,CUDA,110.77093839645386,98.68852459016394 +20250409090514,TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,944.0426867008209,76.63934426229508 +20250409090514,Graph NN,0.01,0,0,0,2000,CPU,40.48178577423096,91.17486476898193 +20250409090514,GraphTM,0.01,10000,10.0,23,10,CUDA,114.66628408432007,98.3879781420765 +20250409090514,TMClassifier,0.01,10000,10.0,32,10,CPU_sparse,1151.3295328617096,74.93169398907104 +20250409090514,Graph NN,0.02,0,0,0,2000,CPU,46.342252254486084,91.42076373100281 +20250409090514,GraphTM,0.02,10000,10.0,23,10,CUDA,121.12805104255676,97.70491803278688 +20250409090514,TMClassifier,0.02,10000,10.0,32,10,CPU_sparse,956.9201290607452,72.37021857923497 +20250409090514,Graph NN,0.05,0,0,0,2000,CPU,48.09459686279297,90.16393423080444 +20250409090514,GraphTM,0.05,10000,10.0,23,10,CUDA,136.35990571975708,94.31693989071039 +20250409090514,TMClassifier,0.05,10000,10.0,32,10,CPU_sparse,951.3514447212219,64.00273224043715 +20250409090514,Graph NN,0.1,0,0,0,2000,CPU,47.61181974411011,77.04917788505554 +20250409090514,GraphTM,0.1,10000,10.0,23,10,CUDA,149.66685557365417,90.40983606557377 +20250409090514,TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,1159.4669754505157,49.62431693989071 +20250409090514,Graph NN,0.2,0,0,0,2000,CPU,40.52361035346985,61.666667461395264 +20250409090514,GraphTM,0.2,10000,10.0,23,10,CUDA,170.45302724838257,79.09836065573771 +20250409090514,TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,942.1310601234436,20.116120218579233 +20250409090514,Graph NN,0.005,0,0,0,2000,CPU,42.33190155029297,80.79234957695007 +20250409090514,GraphTM,0.005,10000,10.0,23,10,CUDA,110.67640900611877,98.82513661202185 
+20250409090514,TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,1151.425032377243,76.63934426229508 +20250409090514,Graph NN,0.01,0,0,0,2000,CPU,46.83778142929077,79.94535565376282 +20250409090514,GraphTM,0.01,10000,10.0,23,10,CUDA,113.82480311393738,98.25136612021858 +20250409090514,TMClassifier,0.01,10000,10.0,32,10,CPU_sparse,959.6910009384155,74.86338797814209 +20250409090514,Graph NN,0.02,0,0,0,2000,CPU,46.91451978683472,79.26229238510132 +20250409090514,GraphTM,0.02,10000,10.0,23,10,CUDA,121.25436019897461,97.81420765027322 +20250409090514,TMClassifier,0.02,10000,10.0,32,10,CPU_sparse,973.5784142017365,72.1311475409836 +20250409090514,Graph NN,0.05,0,0,0,2000,CPU,45.216925859451294,79.56284284591675 +20250409090514,GraphTM,0.05,10000,10.0,23,10,CUDA,136.08299708366394,94.89071038251366 +20250409090514,TMClassifier,0.05,10000,10.0,32,10,CPU_sparse,941.2294843196869,64.1051912568306 +20250409090514,Graph NN,0.1,0,0,0,2000,CPU,35.09868001937866,70.24590373039246 +20250409090514,GraphTM,0.1,10000,10.0,23,10,CUDA,150.008531332016,89.97267759562841 +20250409090514,TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,967.8004837036133,49.62431693989071 +20250409090514,Graph NN,0.2,0,0,0,2000,CPU,40.60944890975952,60.76502799987793 +20250409090514,GraphTM,0.2,10000,10.0,23,10,CUDA,170.61232328414917,78.52459016393442 +20250409090514,TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,1174.2148485183716,20.116120218579233 +20250409090514,Graph NN,0.005,0,0,0,2000,CPU,44.02885293960571,86.72131299972534 +20250409090514,GraphTM,0.005,10000,10.0,23,10,CUDA,110.33011960983276,98.82513661202185 +20250409090514,TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,1164.813972711563,76.74180327868852 +20250409090514,Graph NN,0.01,0,0,0,2000,CPU,34.82557439804077,91.83059930801392 +20250409090514,GraphTM,0.01,10000,10.0,23,10,CUDA,113.68903136253357,98.30601092896175 +20250409090514,TMClassifier,0.01,10000,10.0,32,10,CPU_sparse,1142.874398946762,74.86338797814209 +20250409090514,Graph NN,0.02,0,0,0,2000,CPU,38.12274146080017,84.09836292266846 +20250409090514,GraphTM,0.02,10000,10.0,23,10,CUDA,120.88822174072266,97.89617486338797 +20250409090514,TMClassifier,0.02,10000,10.0,32,10,CPU_sparse,958.9832980632782,72.60928961748634 +20250409090514,Graph NN,0.05,0,0,0,2000,CPU,47.38658022880554,83.63388180732727 +20250409090514,GraphTM,0.05,10000,10.0,23,10,CUDA,136.52869582176208,95.0 +20250409090514,TMClassifier,0.05,10000,10.0,32,10,CPU_sparse,982.7437946796417,64.00273224043715 +20250409090514,Graph NN,0.1,0,0,0,2000,CPU,50.3098578453064,78.49726676940918 +20250409090514,GraphTM,0.1,10000,10.0,23,10,CUDA,150.58712220191956,90.10928961748634 +20250409090514,TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,952.3902399539948,48.97540983606557 +20250409090514,Graph NN,0.2,0,0,0,2000,CPU,47.68881940841675,67.54098534584045 +20250409090514,GraphTM,0.2,10000,10.0,23,10,CUDA,170.5669903755188,78.44262295081967 +20250409090514,TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,1160.643584728241,20.116120218579233 +20250409090514,Graph NN,0.005,0,0,0,2000,CPU,42.35506534576416,80.71038126945496 +20250409090514,GraphTM,0.005,10000,10.0,23,10,CUDA,110.72827911376953,98.46994535519126 +20250409090514,TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,1161.2603483200073,76.70765027322405 +20250409090514,Graph NN,0.01,0,0,0,2000,CPU,44.48380947113037,75.95628499984741 +20250409090514,GraphTM,0.01,10000,10.0,23,10,CUDA,113.78427290916443,98.4153005464481 +20250409090514,TMClassifier,0.01,10000,10.0,32,10,CPU_sparse,1164.732885837555,74.93169398907104 
+20250409090514,Graph NN,0.02,0,0,0,2000,CPU,41.45829200744629,88.27868700027466 +20250409090514,GraphTM,0.02,10000,10.0,23,10,CUDA,121.14582562446594,97.62295081967213 +20250409090514,TMClassifier,0.02,10000,10.0,32,10,CPU_sparse,971.4570569992065,72.40437158469946 +20250409090514,Graph NN,0.05,0,0,0,2000,CPU,44.6593804359436,75.7377028465271 +20250409090514,GraphTM,0.05,10000,10.0,23,10,CUDA,136.09871077537537,94.72677595628414 +20250409090514,TMClassifier,0.05,10000,10.0,32,10,CPU_sparse,1170.4177556037903,64.1051912568306 +20250409090514,Graph NN,0.1,0,0,0,2000,CPU,41.33125162124634,78.38797569274902 +20250409090514,GraphTM,0.1,10000,10.0,23,10,CUDA,150.5243456363678,89.61748633879782 +20250409090514,TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,940.6334030628204,49.62431693989071 +20250409090514,Graph NN,0.2,0,0,0,2000,CPU,43.79690456390381,63.387978076934814 +20250409090514,GraphTM,0.2,10000,10.0,23,10,CUDA,170.47909784317017,77.34972677595628 +20250409090514,TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,951.3798985481262,20.116120218579233 +20250409090514,Graph NN,0.005,0,0,0,2000,CPU,44.20913028717041,94.4535493850708 +20250409090514,GraphTM,0.005,10000,10.0,23,10,CUDA,110.41194748878479,98.82513661202185 +20250409090514,TMClassifier,0.005,10000,10.0,32,10,CPU_sparse,1164.4012093544006,76.74180327868852 +20250409090514,Graph NN,0.01,0,0,0,2000,CPU,43.56287693977356,77.86885499954224 +20250409090514,GraphTM,0.01,10000,10.0,23,10,CUDA,113.86108899116516,98.25136612021858 +20250409090514,TMClassifier,0.01,10000,10.0,32,10,CPU_sparse,1165.0554220676422,74.55601092896174 +20250409090514,Graph NN,0.02,0,0,0,2000,CPU,43.18827676773071,90.76502919197083 +20250409090514,GraphTM,0.02,10000,10.0,23,10,CUDA,121.40065360069275,97.6775956284153 +20250409090514,TMClassifier,0.02,10000,10.0,32,10,CPU_sparse,1142.509984254837,72.1311475409836 +20250409090514,Graph NN,0.05,0,0,0,2000,CPU,46.11475706100464,87.2950792312622 +20250409090514,GraphTM,0.05,10000,10.0,23,10,CUDA,136.23513627052307,93.98907103825137 +20250409090514,TMClassifier,0.05,10000,10.0,32,10,CPU_sparse,1018.888409614563,64.1051912568306 +20250409090514,Graph NN,0.1,0,0,0,2000,CPU,46.72879457473755,72.92349934577942 +20250409090514,GraphTM,0.1,10000,10.0,23,10,CUDA,150.53106451034546,89.75409836065575 +20250409090514,TMClassifier,0.1,10000,10.0,32,10,CPU_sparse,1152.1242747306824,49.62431693989071 +20250409090514,Graph NN,0.2,0,0,0,2000,CPU,42.78840351104736,61.72131299972534 +20250409090514,GraphTM,0.2,10000,10.0,23,10,CUDA,170.45607113838196,78.5792349726776 +20250409090514,TMClassifier,0.2,10000,10.0,32,10,CPU_sparse,1150.6887817382812,20.184426229508194 diff --git a/examples/recomm_system/experiment_results.xlsx b/examples/recomm_system/experiment_results.xlsx new file mode 100644 index 00000000..54f85498 Binary files /dev/null and b/examples/recomm_system/experiment_results.xlsx differ diff --git a/examples/recomm_system/graph_nn.py b/examples/recomm_system/graph_nn.py new file mode 100644 index 00000000..ef6922a7 --- /dev/null +++ b/examples/recomm_system/graph_nn.py @@ -0,0 +1,132 @@ +import argparse +import torch +import torch.nn.functional as F +from torch_geometric.data import Data +from torch_geometric.nn import GCNConv +import prepare_dataset +from tmu.tools import BenchmarkTimer +import os +import pandas as pd + +def main(args): + results = [] + data = prepare_dataset.aug_amazon_products(noise_ratio = args.dataset_noise_ratio) + x, y = prepare_dataset.construct_x_y(data) + X_train, X_test, Y_train, Y_test = 
prepare_dataset.split_train_test(x,y) + # Graph Construction + num_users = len(data['user_id'].unique()) + num_items = len(data['product_id'].unique()) + num_categories = len(data['category'].unique()) + num_nodes = num_users + num_items + num_categories + # Build edge list + edge_list = [] + # User ↔ Item edges + for user, item in zip(X_train[:, 0], X_train[:, 1]): + edge_list.append((user, num_users + item)) # User to Item + edge_list.append((num_users + item, user)) # Item to User + # Item ↔ Category edges + for item, category in zip(X_train[:, 1], X_train[:, 2]): + edge_list.append((num_users + item, num_users + num_items + category)) # Item to Category + edge_list.append((num_users + num_items + category, num_users + item)) # Category to Item + # Create edge index for PyTorch Geometric + edge_index = torch.tensor(edge_list, dtype=torch.long).t() + # Node features + node_features = torch.rand((num_nodes, 64), dtype=torch.float) + # PyTorch Geometric Data object + graph_data = Data(x=node_features, edge_index=edge_index) + # Step 2: Define GCN Model + class GCN(torch.nn.Module): + def __init__(self, input_dim, hidden_dim, output_dim): + super(GCN, self).__init__() + self.conv1 = GCNConv(input_dim, hidden_dim) + self.conv2 = GCNConv(hidden_dim, output_dim) + def forward(self, x, edge_index): + x = self.conv1(x, edge_index) + x = F.relu(x) + x = self.conv2(x, edge_index) + return x + # Initialize Model + model = GCN(input_dim=64, hidden_dim=128, output_dim=64) + # Define optimizer + optimizer = torch.optim.Adam(model.parameters(), lr=0.01) + # Convert train/test data to tensors + train_edges = torch.tensor( + [(user, num_users + item) for user, item in zip(X_train[:, 0], X_train[:, 1])], + dtype=torch.long + ).t() + train_labels = torch.tensor(Y_train, dtype=torch.float) + test_edges = torch.tensor( + [(user, num_users + item) for user, item in zip(X_test[:, 0], X_test[:, 1])], + dtype=torch.long + ).t() + test_labels = torch.tensor(Y_test, dtype=torch.float) + # Training Loop with Accuracy Logging + benchmark_total = BenchmarkTimer(logger=None, text="Epochs Time") + with benchmark_total: + for epoch in range(args.epochs): + benchmark1 = BenchmarkTimer(logger=None, text="Training Time") + with benchmark1: + # Training Phase + model.train() + optimizer.zero_grad() + out = model(graph_data.x, graph_data.edge_index) + # User-item embeddings + user_embeddings = out[train_edges[0]] + item_embeddings = out[train_edges[1]] + predicted_ratings = (user_embeddings * item_embeddings).sum(dim=1) + # Compute loss + loss = F.mse_loss(predicted_ratings, train_labels) + loss.backward() + optimizer.step() + train_time = benchmark1.elapsed() + # Testing Phase + benchmark2 = BenchmarkTimer(logger=None, text="Testing Time") + with benchmark2: + model.eval() + with torch.no_grad(): + out = model(graph_data.x, graph_data.edge_index) + test_user_embeddings = out[test_edges[0]] + test_item_embeddings = out[test_edges[1]] + test_predicted_ratings = (test_user_embeddings * test_item_embeddings).sum(dim=1) + # Compute accuracy + accuracy = ((test_predicted_ratings.round() == test_labels).float().mean().item()) * 100 + test_time = benchmark2.elapsed() + total_time = benchmark_total.elapsed() + # Append the result for this run + results.append({ + "Exp_id": args.exp_id, + "Algorithm": "Graph NN", + "Noise_Ratio": args.dataset_noise_ratio, + "T": 0, + "s": 0, + "Max_Included_Literals": 0, + "Epochs": args.epochs, + "Platform": args.platform, + "Total_Time": total_time, + "Accuracy": accuracy, + }) + + # Save results to
CSV + results_df = pd.DataFrame(results) + results_file = "experiment_results.csv" + if os.path.exists(results_file): + results_df.to_csv(results_file, mode='a', index=False, header=False) + else: + results_df.to_csv(results_file, index=False) + print(f"Results saved to {results_file}") + + +def default_args(**kwargs): + parser = argparse.ArgumentParser() + parser.add_argument("--platform", default="CPU", type=str, choices=["CPU", "CUDA"]) + parser.add_argument("--epochs", default=2000, type=int) + parser.add_argument("--dataset_noise_ratio", default=0.01, type=float) + parser.add_argument("--exp_id", default="", type=str) + args = parser.parse_args() + for key, value in kwargs.items(): + if key in args.__dict__: + setattr(args, key, value) + return args + +if __name__ == "__main__": + main(default_args()) \ No newline at end of file diff --git a/examples/recomm_system/graph_tm.py b/examples/recomm_system/graph_tm.py new file mode 100644 index 00000000..27c30828 --- /dev/null +++ b/examples/recomm_system/graph_tm.py @@ -0,0 +1,175 @@ +from GraphTsetlinMachine.graphs import Graphs +from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine +import argparse +import numpy as np +import prepare_dataset +import pandas as pd +from tmu.tools import BenchmarkTimer +import os + +def main(args): + np.random.seed(42) + results = [] + data = prepare_dataset.aug_amazon_products(noise_ratio = args.dataset_noise_ratio) + x, y = prepare_dataset.construct_x_y(data) + X_train, X_test, Y_train, Y_test = prepare_dataset.split_train_test(x,y) + users = data['user_id'].unique() + print("Users: ",len(users)) + + items = data['product_id'].unique() + print("Items: ",len(items)) + + categories = data['category'].unique() + print("Categories: ",len(categories)) + + # Initialize Graphs with symbols for GTM + number_of_nodes = 3 + symbols = [] + symbols = ["U_" + str(u) for u in users] + ["I_" + str(i) for i in items] + ["C_" + str(c) for c in categories] + print("Symbols: ",len(symbols)) + + # Train data + graphs_train = Graphs( + X_train.shape[0], + symbols=symbols, + hypervector_size=args.hypervector_size, + hypervector_bits=args.hypervector_bits, + double_hashing = args.double_hashing + ) + for graph_id in range(X_train.shape[0]): + graphs_train.set_number_of_graph_nodes(graph_id, number_of_nodes) + graphs_train.prepare_node_configuration() + for graph_id in range(X_train.shape[0]): + for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): + number_of_edges = 2 if node_id > 0 and node_id < graphs_train.number_of_graph_nodes[graph_id]-1 else 1 + if node_id == 0: + graphs_train.add_graph_node(graph_id, "User", number_of_edges) + elif node_id == 1: + graphs_train.add_graph_node(graph_id, "Item", number_of_edges) + else: + graphs_train.add_graph_node(graph_id, "Category", number_of_edges) + graphs_train.prepare_edge_configuration() + for graph_id in range(X_train.shape[0]): + for node_id in range(graphs_train.number_of_graph_nodes[graph_id]): + if node_id == 0: + graphs_train.add_graph_node_edge(graph_id, "User", "Item", "UserItem") + + if node_id == 1: + graphs_train.add_graph_node_edge(graph_id, "Item", "Category", "ItemCategory") + graphs_train.add_graph_node_edge(graph_id, "Item", "User", "ItemUser") + + if node_id == 2: + graphs_train.add_graph_node_edge(graph_id, "Category", "Item", "CategoryItem") + + graphs_train.add_graph_node_property(graph_id, "User", "U_" + str(X_train[graph_id][0])) + graphs_train.add_graph_node_property(graph_id, "Item", "I_" + str(X_train[graph_id][1])) + 
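+ # Note: the per-example signal lives entirely in these three properties; every graph shares the same User-Item-Category topology, and the U_/I_/C_ symbols bound here are what the hypervector encoding distinguishes between examples.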
graphs_train.add_graph_node_property(graph_id, "Category", "C_" + str(X_train[graph_id][2])) + graphs_train.encode() + print("Training data produced") + + # Test data + graphs_test = Graphs(X_test.shape[0], init_with=graphs_train) + for graph_id in range(X_test.shape[0]): + graphs_test.set_number_of_graph_nodes(graph_id, number_of_nodes) + graphs_test.prepare_node_configuration() + for graph_id in range(X_test.shape[0]): + for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): + number_of_edges = 2 if node_id > 0 and node_id < graphs_test.number_of_graph_nodes[graph_id]-1 else 1 + if node_id == 0: + graphs_test.add_graph_node(graph_id, "User", number_of_edges) + elif node_id == 1: + graphs_test.add_graph_node(graph_id, "Item", number_of_edges) + else: + graphs_test.add_graph_node(graph_id, "Category", number_of_edges) + graphs_test.prepare_edge_configuration() + for graph_id in range(X_test.shape[0]): + for node_id in range(graphs_test.number_of_graph_nodes[graph_id]): + if node_id == 0: + graphs_test.add_graph_node_edge(graph_id, "User", "Item", "UserItem") + + if node_id == 1: + graphs_test.add_graph_node_edge(graph_id, "Item", "Category", "ItemCategory") + graphs_test.add_graph_node_edge(graph_id, "Item", "User", "ItemUser") + + if node_id == 2: + graphs_test.add_graph_node_edge(graph_id, "Category", "Item", "CategoryItem") + + graphs_test.add_graph_node_property(graph_id, "User", "U_" + str(X_test[graph_id][0])) + graphs_test.add_graph_node_property(graph_id, "Item", "I_" + str(X_test[graph_id][1])) + graphs_test.add_graph_node_property(graph_id, "Category", "C_" + str(X_test[graph_id][2])) + graphs_test.encode() + print("Testing data produced") + + tm = MultiClassGraphTsetlinMachine( + args.number_of_clauses, + args.T, + args.s, + number_of_state_bits = args.number_of_state_bits, + depth=args.depth, + message_size=args.message_size, + message_bits=args.message_bits, + max_included_literals=args.max_included_literals, + double_hashing = args.double_hashing + ) + + benchmark_total = BenchmarkTimer(logger=None, text="Epoch Time") + with benchmark_total: + for epoch in range(args.epochs): + benchmark1 = BenchmarkTimer(logger=None, text="Training Time") + with benchmark1: + tm.fit(graphs_train, Y_train, epochs=1, incremental=True) + train_time = benchmark1.elapsed() + + benchmark2 = BenchmarkTimer(logger=None, text="Testing Time") + with benchmark2: + accuracy = 100*(tm.predict(graphs_test) == Y_test).mean() + test_time = benchmark2.elapsed() + total_time = benchmark_total.elapsed() + # result_train = 100*(tm.predict(graphs_train) == Y_train).mean() + results.append({ + "Exp_id": args.exp_id, + "Algorithm": "GraphTM", + "Noise_Ratio": args.dataset_noise_ratio, + "T": args.T, + "s": args.s, + "Max_Included_Literals": args.max_included_literals, + "Epochs": args.epochs, + "Platform": "CUDA", + "Total_Time": total_time, + "Accuracy": accuracy, + }) + + # Save results to CSV + results_df = pd.DataFrame(results) + results_file = "experiment_results.csv" + if os.path.exists(results_file): + results_df.to_csv(results_file, mode='a', index=False, header=False) + else: + results_df.to_csv(results_file, index=False) + print(f"Results saved to {results_file}") + +def default_args(**kwargs): + parser = argparse.ArgumentParser() + parser.add_argument("--epochs", default=10, type=int) + parser.add_argument("--number-of-clauses", default=2000, type=int) + parser.add_argument("--T", default=10000, type=int) + parser.add_argument("--s", default=10.0, type=float) + 
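+ # T (vote threshold) and s (specificity) above are the usual Tsetlin Machine hyperparameters; the defaults match the T=10000, s=10.0 rows logged for GraphTM in experiment_results.csv.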
parser.add_argument("--number-of-state-bits", default=8, type=int) + parser.add_argument("--depth", default=1, type=int) + parser.add_argument("--hypervector-size", default=4096, type=int) + parser.add_argument("--hypervector-bits", default=256, type=int) + parser.add_argument("--message-size", default=256, type=int) + parser.add_argument("--message-bits", default=2, type=int) + parser.add_argument('--double-hashing', dest='double_hashing', default=False, action='store_true') + parser.add_argument("--noise", default=0.01, type=float) + parser.add_argument("--max-included-literals", default=23, type=int) + parser.add_argument("--dataset_noise_ratio", default=0.01, type=float) + parser.add_argument("--exp_id", default="", type=str) + args = parser.parse_args() + for key, value in kwargs.items(): + if key in args.__dict__: + setattr(args, key, value) + return args + +if __name__ == "__main__": + main(default_args()) \ No newline at end of file diff --git a/examples/recomm_system/main.py b/examples/recomm_system/main.py new file mode 100644 index 00000000..a7ba9ef9 --- /dev/null +++ b/examples/recomm_system/main.py @@ -0,0 +1,36 @@ +from datetime import datetime +import graph_nn +import graph_tm +import tm_classifier + +dataset_noise_ratios = [0.005, 0.01, 0.02, 0.05, 0.1, 0.2] +num_iterations = 10 +exp_id = datetime.now().strftime("%Y%m%d%H%M%S") + +print(f"{datetime.now()}, Setup the environment ...") +print(f"Experiment ID: {exp_id}") + +for i in range(1, num_iterations + 1): + print(f"Iteration {i} of {num_iterations}") + + for N in dataset_noise_ratios: + print(f"{datetime.now()}, Running Graph NN ...") + args_nn = graph_nn.default_args( + dataset_noise_ratio=N, + exp_id=exp_id + ) + graph_nn.main(args_nn) + + print(f"{datetime.now()}, Running Graph Tsetlin Machine ...") + args_tm = graph_tm.default_args( + dataset_noise_ratio=N, + exp_id=exp_id + ) + graph_tm.main(args_tm) + + print(f"{datetime.now()}, Running Tsetlin Machine Classifier ...") + args_classifier = tm_classifier.default_args( + dataset_noise_ratio=N, + exp_id=exp_id + ) + tm_classifier.main(args_classifier) \ No newline at end of file diff --git a/examples/recomm_system/main.sh b/examples/recomm_system/main.sh new file mode 100644 index 00000000..82b03778 --- /dev/null +++ b/examples/recomm_system/main.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +echo `date`, Setup the environment ... +set -e # exit if error + +models="graph_tm tm_classifier graph_nn" +dataset_noise_ratios="0.005 0.01 0.02 0.05 0.1 0.2" +num_iterations=10 # Number of times to repeat the experiments +exp_id=$(date +%Y%m%d%H%M%S) + +echo 'Experiment ID: ' $exp_id + +for (( i=1; i<=num_iterations; i++ )) +do + echo "Iteration $i of $num_iterations" + + for N in $dataset_noise_ratios; do + echo `date`, Running Graph NN ... + python3 graph_nn.py --dataset_noise_ratio $N --exp_id $exp_id + + echo `date`, Running Graph Tsetlin Machine ... + python3 graph_tm.py --dataset_noise_ratio $N --exp_id $exp_id + + echo `date`, Running Tsetlin Machine Classifier ... 
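+ # All three scripts append rows to experiment_results.csv under the shared $exp_id, giving one comparable entry per model for each noise ratio and iteration.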
+ python3 tm_classifier.py --dataset_noise_ratio $N --exp_id $exp_id + done +done + + diff --git a/examples/recomm_system/prepare_dataset.py b/examples/recomm_system/prepare_dataset.py new file mode 100644 index 00000000..dfe1b50b --- /dev/null +++ b/examples/recomm_system/prepare_dataset.py @@ -0,0 +1,181 @@ +import pandas as pd +import kagglehub +import numpy as np +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import OneHotEncoder + +def amazon_products(): + print("Creating training data") + path = kagglehub.dataset_download("karkavelrajaj/amazon-sales-dataset") + print("Path to dataset files:", path) + data_file = path + "/amazon.csv" + org_data = pd.read_csv(data_file) + print("Original data shape:", org_data.shape) + return org_data[['product_id', 'category', 'user_id', 'rating']] + +def aug_amazon_products(noise_ratio = 0.01): + np.random.seed(42) + org_data = amazon_products() + org_data['rating'] = pd.to_numeric(org_data['rating'], errors='coerce') # Coerce invalid values to NaN + org_data.dropna(subset=['rating'], inplace=True) # Drop rows with NaN ratings + org_data['rating'] = org_data['rating'].astype(int) + # Expand the dataset 10 times + data = pd.concat([org_data] * 10, ignore_index=True) + # Shuffle the expanded dataset + data = data.sample(frac=1, random_state=42).reset_index(drop=True) + # Add noise + # Select rows to apply noise + num_noisy_rows = int(noise_ratio * len(data)) + noisy_indices = np.random.choice(data.index, size=num_noisy_rows, replace=False) + # Add noise to ratings + data.loc[noisy_indices, 'rating'] = np.random.choice(range(1, 6), size=num_noisy_rows) + # Add noise to categories + unique_categories = data['category'].unique() + data.loc[noisy_indices, 'category'] = np.random.choice(unique_categories, size=num_noisy_rows) + # Print the shape of the expanded, noisy dataset + print("Expanded data shape:", data.shape) + return data + +def artificial(): + np.random.seed(42) + num_users = 5 # Number of unique users + num_items = 10 # Number of unique items + num_categories = 5 # Number of unique categories + num_interactions = 1000 # Number of user-item interactions + # Generate random ratings (e.g., between 1 and 2) + ratings = np.random.choice(range(1, 3), num_interactions) + # Generate random user-item interactions + user_ids = np.random.choice(range(num_users), num_interactions) + item_ids = np.random.choice(range(num_items), num_interactions) + categories = np.random.choice(range(num_categories), num_interactions) + + data = pd.DataFrame({ + 'user_id': user_ids, + 'product_id': item_ids, + 'category': categories, + 'rating': ratings + }) + return data + +def artificial_with_user_pref(): + np.random.seed(42) + num_users = 100 # Number of unique users + num_items = 50 # Number of unique items + num_categories = 50 # Number of unique categories + num_interactions = 1000 # Number of user-item interactions + noise_ratio = 0.01 # Fraction of noisy interactions + + # Generate user preferences: each user prefers 1-3 random categories + user_preferences = { + user: np.random.choice(range(num_categories), size=np.random.randint(1, 4), replace=False) + for user in range(num_users) + } + + # Assign each item to a category + item_categories = {item: np.random.choice(range(num_categories)) for item in range(num_items)} + + # Generate interactions + user_ids = np.random.choice(range(num_users), num_interactions) + item_ids = np.random.choice(range(num_items), num_interactions) + + # 
Generate ratings based on the pattern + ratings = [] + for user, item in zip(user_ids, item_ids): + item_category = item_categories[item] + if item_category in user_preferences[user]: + ratings.append(np.random.choice([3, 4])) # High rating for preferred categories + else: + ratings.append(np.random.choice([1, 2])) # Low rating otherwise + + # Introduce noise + num_noisy = int(noise_ratio * num_interactions) + noisy_indices = np.random.choice(range(num_interactions), num_noisy, replace=False) + for idx in noisy_indices: + ratings[idx] = np.random.choice(range(1, 6)) # Replace with random rating + + # Combine into a DataFrame + data = pd.DataFrame({ + 'user_id': user_ids, + 'product_id': item_ids, + 'category': [item_categories[item] for item in item_ids], + 'rating': ratings + }) + return data + +def artificial_patterned(): + np.random.seed(42) + num_users = 100 # Number of unique users + num_items = 50 # Number of unique items + num_categories = 5 # Number of unique categories + num_interactions = 10000 # Number of user-item interactions + noise_ratio = 0.01 # Fraction of noisy interactions + + # Step 1: Define deterministic user preferences + user_preferences = {user: user % num_categories for user in range(num_users)} + + # Step 2: Assign items to categories in a cyclic pattern + item_categories = {item: item % num_categories for item in range(num_items)} + + # Step 3: Generate deterministic interactions + user_ids = np.arange(num_interactions) % num_users # Cycle through users + item_ids = np.arange(num_interactions) % num_items # Cycle through items + + # Step 4: Generate ratings based on the pattern + ratings = [] + for user, item in zip(user_ids, item_ids): + preferred_category = user_preferences[user] + item_category = item_categories[item] + if item_category == preferred_category: + ratings.append(5) # High rating for preferred category + else: + ratings.append(1) # Low rating otherwise + + # Step 5: Introduce noise + num_noisy = int(noise_ratio * num_interactions) + noisy_indices = np.random.choice(range(num_interactions), num_noisy, replace=False) + for idx in noisy_indices: + ratings[idx] = np.random.choice(range(1, 6)) # Replace with random rating + + # Step 6: Create a DataFrame + data = pd.DataFrame({ + 'user_id': user_ids, + 'product_id': item_ids, + 'category': [item_categories[item] for item in item_ids], + 'rating': ratings + }) + return data + +def construct_x_y(data): + le_user = LabelEncoder() + le_item = LabelEncoder() + le_category = LabelEncoder() + le_rating = LabelEncoder() + data['user_id'] = le_user.fit_transform(data['user_id']) + data['product_id'] = le_item.fit_transform(data['product_id']) + data['category'] = le_category.fit_transform(data['category']) + data['rating'] = le_rating.fit_transform(data['rating']) + x = data[['user_id', 'product_id', 'category']].values + y = data['rating'].values + return x,y + +def split_train_test(x,y): + X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42) + print("X_train shape:", X_train.shape) + print("y_train shape:", Y_train.shape) + print("X_test shape:", X_test.shape) + print("y_test shape:", Y_test.shape) + return X_train, X_test, Y_train, Y_test + +def one_hot_encoding(x,y): + encoder = OneHotEncoder(sparse_output=False, dtype=np.uint32) + x_binary = encoder.fit_transform(x) + # print(f"Number of features after one-hot encoding: {x_binary.shape[1]}") + x_train, x_test, y_train, y_test = split_train_test(x_binary, y) + y_train = y_train.astype(np.uint32) + y_test = 
y_test.astype(np.uint32) + print("x_train shape:", x_train.shape, "dtype:", x_train.dtype) + print("y_train shape:", y_train.shape, "dtype:", y_train.dtype) + print("x_test shape:", x_test.shape, "dtype:", x_test.dtype) + print("y_test shape:", y_test.shape, "dtype:", y_test.dtype) + return x_train, x_test, y_train, y_test \ No newline at end of file diff --git a/examples/recomm_system/test.ipynb b/examples/recomm_system/test.ipynb new file mode 100644 index 00000000..a1834796 --- /dev/null +++ b/examples/recomm_system/test.ipynb @@ -0,0 +1,285 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating training data\n", + "Warning: Looks like you're using an outdated `kagglehub` version (installed: 0.3.5), please consider upgrading to the latest version (0.3.6).\n", + "Path to dataset files: /root/.cache/kagglehub/datasets/karkavelrajaj/amazon-sales-dataset/versions/1\n", + "Original data shape: (1465, 16)\n", + "Expanded data shape: (14640, 4)\n", + "Dataset saved to noisy_dataset_0.005.csv\n", + "Creating training data\n", + "Warning: Looks like you're using an outdated `kagglehub` version (installed: 0.3.5), please consider upgrading to the latest version (0.3.6).\n", + "Path to dataset files: /root/.cache/kagglehub/datasets/karkavelrajaj/amazon-sales-dataset/versions/1\n", + "Original data shape: (1465, 16)\n", + "Expanded data shape: (14640, 4)\n", + "Dataset saved to noisy_dataset_0.01.csv\n", + "Creating training data\n", + "Warning: Looks like you're using an outdated `kagglehub` version (installed: 0.3.5), please consider upgrading to the latest version (0.3.6).\n", + "Path to dataset files: /root/.cache/kagglehub/datasets/karkavelrajaj/amazon-sales-dataset/versions/1\n", + "Original data shape: (1465, 16)\n", + "Expanded data shape: (14640, 4)\n", + "Dataset saved to noisy_dataset_0.02.csv\n", + "Creating training data\n", + "Warning: Looks like you're using an outdated `kagglehub` version (installed: 0.3.5), please consider upgrading to the latest version (0.3.6).\n", + "Path to dataset files: /root/.cache/kagglehub/datasets/karkavelrajaj/amazon-sales-dataset/versions/1\n", + "Original data shape: (1465, 16)\n", + "Expanded data shape: (14640, 4)\n", + "Dataset saved to noisy_dataset_0.05.csv\n", + "Creating training data\n", + "Warning: Looks like you're using an outdated `kagglehub` version (installed: 0.3.5), please consider upgrading to the latest version (0.3.6).\n", + "Path to dataset files: /root/.cache/kagglehub/datasets/karkavelrajaj/amazon-sales-dataset/versions/1\n", + "Original data shape: (1465, 16)\n", + "Expanded data shape: (14640, 4)\n", + "Dataset saved to noisy_dataset_0.1.csv\n", + "Creating training data\n", + "Warning: Looks like you're using an outdated `kagglehub` version (installed: 0.3.5), please consider upgrading to the latest version (0.3.6).\n", + "Path to dataset files: /root/.cache/kagglehub/datasets/karkavelrajaj/amazon-sales-dataset/versions/1\n", + "Original data shape: (1465, 16)\n", + "Expanded data shape: (14640, 4)\n", + "Dataset saved to noisy_dataset_0.2.csv\n" + ] + } + ], + "source": [ + "import prepare_dataset\n", + "import pandas as pd\n", + "import os\n", + "\n", + "dataset_noise_ratios = [0.005,0.01,0.02,0.05,0.1,0.2]\n", + "for noise in dataset_noise_ratios:\n", + " data = prepare_dataset.aug_amazon_products(noise_ratio = noise)\n", + " df = pd.DataFrame(data)\n", + " noise_dataset_file = 
f\"noisy_dataset_{noise}.csv\"\n", + " if os.path.exists(noise_dataset_file):\n", + " df.to_csv(noise_dataset_file, mode='a', index=False, header=False)\n", + " else:\n", + " df.to_csv(noise_dataset_file, index=False)\n", + " print(f\"Dataset saved to {noise_dataset_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\\begin{table}[h!]\n", + "\\centering\n", + "\\begin{tabular}{|c|c|c|c|}\n", + "\\hline\n", + "\\textbf{Noise Ratio} & \\textbf{GCN (\\%)} & \\textbf{GTM (\\%)} & \\textbf{TMClassifier (\\%)} \\\\ \\hline\n", + "0.005 & 83.39 & 98.73 & 76.73 \\\\ \\hline\n", + "0.01 & 85.55 & 98.35 & 74.87 \\\\ \\hline\n", + "0.02 & 83.57 & 97.73 & 72.24 \\\\ \\hline\n", + "0.05 & 82.13 & 94.61 & 63.86 \\\\ \\hline\n", + "0.1 & 75.93 & 89.85 & 49.48 \\\\ \\hline\n", + "0.2 & 64.12 & 78.73 & 20.13 \\\\ \\hline\n", + "\\end{tabular}\n", + "\\caption{Average accuracy comparison of GCN, GraphTM, and TMClassifier for varying noise ratios.}\n", + "\\label{tab:recomm_sys_accuracy}\n", + "\\end{table}\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "data = pd.read_csv(\"experiment_results.csv\")\n", + "exp_id = \"20250409090514\" \n", + "data['Exp_id'] = data['Exp_id'].astype(str)\n", + "filtered_data = data[data['Exp_id'] == exp_id]\n", + "# print(filtered_data)\n", + "\n", + "# Create a dictionary to store the accuracy values\n", + "noise_accuracies = {}\n", + "\n", + "# Algorithm,Noise_Ratio,T,s,Max_Included_Literals,Epochs,Platform,Total_Time,Accuracy\n", + "# Group the data by Algorithm and Noise Ratio to calculate average accuracies\n", + "grouped_data = filtered_data.groupby(['Algorithm', 'Noise_Ratio']).agg({'Accuracy': 'mean'}).reset_index()\n", + "\n", + "# Pivot the data to get a structure suitable for LaTeX table generation\n", + "pivot_data = grouped_data.pivot(index='Noise_Ratio', columns='Algorithm', values='Accuracy')\n", + " \n", + "# Generate LaTeX table\n", + "latex_table = \"\"\"\n", + "\\\\begin{table}[h!]\n", + "\\\\centering\n", + "\\\\begin{tabular}{|c|c|c|c|}\n", + "\\\\hline\n", + "\\\\textbf{Noise Ratio} & \\\\textbf{GCN (\\\\%)} & \\\\textbf{GTM (\\\\%)} & \\\\textbf{TMClassifier (\\\\%)} \\\\\\\\ \\\\hline\n", + "\"\"\"\n", + "\n", + "# Iterate over the pivot data to construct the table rows\n", + "for noise_ratio, row in pivot_data.iterrows():\n", + " latex_table += f\"{noise_ratio} & \"\n", + " latex_table += f\"{row['Graph NN']:.2f} & {row['GraphTM']:.2f} & {row['TMClassifier']:.2f} \\\\\\\\ \\\\hline\\n\"\n", + "\n", + "latex_table += \"\\\\end{tabular}\\n\"\n", + "latex_table += \"\\\\caption{Average accuracy comparison of GCN, GraphTM, and TMClassifier for varying noise ratios.}\\n\"\n", + "latex_table += \"\\\\label{tab:recomm_sys_accuracy}\\n\"\n", + "latex_table += \"\\\\end{table}\"\n", + "\n", + "print(latex_table)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\\begin{table}[h!]\n", + "\\centering\n", + "\\begin{tabular}{|c|c|c|c|}\n", + "\\hline\n", + "\\textbf{Noise Ratio} & \\textbf{GCN (\\%)} & \\textbf{GraphTM (\\%)} & \\textbf{TMClassifier (\\%)} \\\\ \\hline\n", + "0.005 & 83.39 \\pm 4.83 & 98.73 \\pm 0.12 & 76.73 \\pm 0.14 \\\\ \\hline\n", + "0.01 & 85.55 \\pm 6.99 & 98.35 \\pm 0.08 & 74.87 \\pm 0.12 \\\\ \\hline\n", + "0.02 & 83.57 \\pm 5.76 & 97.73 \\pm 0.13 & 72.24 \\pm 0.26 \\\\ \\hline\n", + 
"0.05 & 82.13 \\pm 5.30 & 94.61 \\pm 0.34 & 63.86 \\pm 0.34 \\\\ \\hline\n", + "0.1 & 75.93 \\pm 3.89 & 89.85 \\pm 0.29 & 49.48 \\pm 0.38 \\\\ \\hline\n", + "0.2 & 64.12 \\pm 3.07 & 78.73 \\pm 0.75 & 20.13 \\pm 0.04 \\\\ \\hline\n", + "\\end{tabular}\n", + "\\caption{Average accuracy and standard deviation comparison of GCN, GraphTM, and TMClassifier for varying noise ratios.}\n", + "\\label{tab:recomm_sys_accuracy}\n", + "\\{table}\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Load the data\n", + "data = pd.read_csv(\"experiment_results.csv\")\n", + "exp_id = \"20250409090514\"\n", + "data['Exp_id'] = data['Exp_id'].astype(str)\n", + "\n", + "# Filter the data for the specified experiment ID\n", + "filtered_data = data[data['Exp_id'] == exp_id]\n", + "\n", + "# Group the data by Algorithm and Noise Ratio to calculate average accuracies and standard deviations\n", + "grouped_data = filtered_data.groupby(['Algorithm', 'Noise_Ratio']).agg(\n", + " Accuracy_mean=('Accuracy', 'mean'),\n", + " Accuracy_std=('Accuracy', 'std')\n", + ").reset_index()\n", + "\n", + "# Pivot the data to get a structure suitable for LaTeX table generation\n", + "pivot_data_mean = grouped_data.pivot(index='Noise_Ratio', columns='Algorithm', values='Accuracy_mean')\n", + "pivot_data_std = grouped_data.pivot(index='Noise_Ratio', columns='Algorithm', values='Accuracy_std')\n", + "\n", + "# Start building the LaTeX table\n", + "latex_table = \"\"\"\n", + "\\\\begin{table}[h!]\n", + "\\\\centering\n", + "\\\\begin{tabular}{|c|c|c|c|}\n", + "\\\\hline\n", + "\\\\textbf{Noise Ratio} & \\\\textbf{GCN (\\\\%)} & \\\\textbf{GraphTM (\\\\%)} & \\\\textbf{TMClassifier (\\\\%)} \\\\\\\\ \\\\hline\n", + "\"\"\"\n", + "\n", + "# Iterate over the pivot data to construct the table rows with mean and standard deviation\n", + "for noise_ratio in pivot_data_mean.index:\n", + " gcn_mean = pivot_data_mean.loc[noise_ratio, 'Graph NN']\n", + " gcn_std = pivot_data_std.loc[noise_ratio, 'Graph NN']\n", + " \n", + " graph_tm_mean = pivot_data_mean.loc[noise_ratio, 'GraphTM']\n", + " graph_tm_std = pivot_data_std.loc[noise_ratio, 'GraphTM']\n", + " \n", + " tm_classifier_mean = pivot_data_mean.loc[noise_ratio, 'TMClassifier']\n", + " tm_classifier_std = pivot_data_std.loc[noise_ratio, 'TMClassifier']\n", + "\n", + " latex_table += f\"{noise_ratio} & \"\n", + " latex_table += f\"{gcn_mean:.2f} \\\\pm {gcn_std:.2f} & \"\n", + " latex_table += f\"{graph_tm_mean:.2f} \\\\pm {graph_tm_std:.2f} & \"\n", + " latex_table += f\"{tm_classifier_mean:.2f} \\\\pm {tm_classifier_std:.2f} \\\\\\\\ \\\\hline\\n\"\n", + "\n", + "latex_table += \"\\\\end{tabular}\\n\"\n", + "latex_table += \"\\\\caption{Average accuracy and standard deviation comparison of GCN, GraphTM, and TMClassifier for varying noise ratios.}\\n\"\n", + "latex_table += \"\\\\label{tab:recomm_sys_accuracy}\\n\"\n", + "latex_table += \"\\\\{table}\"\n", + "\n", + "print(latex_table)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Averages across all noise ratios:\n", + "Algorithm: Graph NN, Average Accuracy: 79.11%, Average Total Time: 44.80s\n", + "Algorithm: GraphTM, Average Accuracy: 93.00%, Average Total Time: 133.75s\n", + "Algorithm: TMClassifier, Average Accuracy: 59.55%, Average Total Time: 1068.99s\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Read the CSV file\n", + "data = pd.read_csv(\"experiment_results.csv\")\n", + "\n", 
+ "# Define the experiment ID you want to filter\n", + "exp_id = \"20250409090514\"\n", + "\n", + "# Ensure that Exp_id is treated as a string\n", + "data['Exp_id'] = data['Exp_id'].astype(str)\n", + "\n", + "# Filter the data based on the experiment ID\n", + "filtered_data = data[data['Exp_id'] == exp_id]\n", + "\n", + "# Group the data by Algorithm to calculate average accuracies and total time across all noise ratios\n", + "grouped_data = filtered_data.groupby('Algorithm').agg({'Accuracy': 'mean', 'Total_Time': 'mean'}).reset_index()\n", + "\n", + "# Print the average results for each algorithm across all noise ratios\n", + "print(\"Averages across all noise ratios:\")\n", + "for _, row in grouped_data.iterrows():\n", + " algorithm = row['Algorithm']\n", + " average_accuracy = row['Accuracy']\n", + " average_total_time = row['Total_Time']\n", + " \n", + " # Print the results\n", + " print(f\"Algorithm: {algorithm}, Average Accuracy: {average_accuracy:.2f}%, Average Total Time: {average_total_time:.2f}s\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/recomm_system/tm_classifier.py b/examples/recomm_system/tm_classifier.py new file mode 100644 index 00000000..cb6cb458 --- /dev/null +++ b/examples/recomm_system/tm_classifier.py @@ -0,0 +1,76 @@ +import argparse +from tmu.models.classification.vanilla_classifier import TMClassifier +from tmu.tools import BenchmarkTimer +import prepare_dataset +import pandas as pd +import os + +def main(args): + results = [] + data = prepare_dataset.aug_amazon_products(noise_ratio = args.dataset_noise_ratio) + x, y = prepare_dataset.construct_x_y(data) + X_train, X_test, Y_train, Y_test = prepare_dataset.one_hot_encoding(x,y) + tm = TMClassifier( + number_of_clauses=args.num_clauses, + T=args.T, + s=args.s, + max_included_literals=args.max_included_literals, + platform=args.platform, + weighted_clauses=args.weighted_clauses, + ) + + benchmark_total = BenchmarkTimer(logger=None, text="Epoch Time") + with benchmark_total: + for epoch in range(args.epochs): + benchmark1 = BenchmarkTimer(logger=None, text="Training Time") + with benchmark1: + tm.fit(X_train, Y_train) + train_time = benchmark1.elapsed() + benchmark2 = BenchmarkTimer(logger=None, text="Testing Time") + with benchmark2: + accuracy = 100 * (tm.predict(X_test) == Y_test).mean() + test_time = benchmark2.elapsed() + total_time = benchmark_total.elapsed() + + # Append results for each epoch + results.append({ + "Exp_id": args.exp_id, + "Algorithm": "TMClassifier", + "Noise_Ratio": args.dataset_noise_ratio, + "T": args.T, + "s": args.s, + "Max_Included_Literals": args.max_included_literals, + "Epochs": args.epochs, + "Platform": args.platform, + "Total_Time": total_time, + "Accuracy": accuracy, + }) + + # Save results to CSV + results_df = pd.DataFrame(results) + results_file = "experiment_results.csv" + if os.path.exists(results_file): + results_df.to_csv(results_file, mode='a', index=False, header=False) + else: + results_df.to_csv(results_file, index=False) + print(f"Results saved to {results_file}") + +def default_args(**kwargs): + parser = argparse.ArgumentParser() + 
parser.add_argument("--num_clauses", default=2000, type=int) + parser.add_argument("--T", default=10000, type=int) + parser.add_argument("--s", default=10.0, type=float) + parser.add_argument("--max_included_literals", default=32, type=int) + parser.add_argument("--platform", default="CPU_sparse", type=str, choices=["CPU", "CPU_sparse", "CUDA"]) + parser.add_argument("--weighted_clauses", default=True, type=bool) + parser.add_argument("--epochs", default=10, type=int) + parser.add_argument("--dataset_noise_ratio", default=0.01, type=float) + parser.add_argument("--exp_id", default="", type=str) + args = parser.parse_args() + for key, value in kwargs.items(): + if key in args.__dict__: + setattr(args, key, value) + return args + +if __name__ == "__main__": + main(default_args()) \ No newline at end of file diff --git a/requirments.txt b/requirments.txt new file mode 100644 index 00000000..12b86c03 --- /dev/null +++ b/requirments.txt @@ -0,0 +1,7 @@ +numpy +numba +pycuda +scipy +pandas +kagglehub +scikit-learn \ No newline at end of file