diff --git a/1.png b/1.png
new file mode 100644
index 0000000..3ed5872
Binary files /dev/null and b/1.png differ
diff --git a/2.png b/2.png
new file mode 100644
index 0000000..3f50d62
Binary files /dev/null and b/2.png differ
diff --git a/3.png b/3.png
new file mode 100644
index 0000000..56fc8cb
Binary files /dev/null and b/3.png differ
diff --git a/4.png b/4.png
new file mode 100644
index 0000000..92bcb17
Binary files /dev/null and b/4.png differ
diff --git a/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj b/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj
index 4515c57..b0d950c 100755
--- a/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj
+++ b/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj
@@ -25,6 +25,8 @@
     <ClInclude Include="..\..\src\interactions.h" />
     <ClInclude Include="..\..\src\intersections.h" />
     <ClInclude Include="..\..\src\main.h" />
+    <ClInclude Include="..\..\src\main_initializer.h" />
+    <ClInclude Include="..\..\src\main_runtime.h" />
     <ClInclude Include="..\..\src\raytraceKernel.h" />
     <ClInclude Include="..\..\src\scene.h" />
     <ClInclude Include="..\..\src\sceneStructs.h" />
@@ -36,6 +38,8 @@
     <ClCompile Include="..\..\src\glslUtility.cpp" />
     <ClCompile Include="..\..\src\image.cpp" />
     <ClCompile Include="..\..\src\main.cpp" />
+    <ClCompile Include="..\..\src\main_initializer.cpp" />
+    <ClCompile Include="..\..\src\main_runtime.cpp" />
     <ClCompile Include="..\..\src\scene.cpp" />
     <ClCompile Include="..\..\src\stb_image\stb_image.c" />
     <ClCompile Include="..\..\src\stb_image\stb_image_write.c" />
@@ -202,4 +206,4 @@
   <ImportGroup Label="ExtensionTargets">
     <Import Project="..\Build\CUDA 4.0.targets" />
   </ImportGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.filters b/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.filters
index d49ad9c..cccbe68 100755
--- a/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.filters
+++ b/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.filters
@@ -12,6 +12,8 @@
     <ClCompile Include="..\..\src\stb_image\stb_image_write.c">
       <Filter>stb_image</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\main_initializer.cpp" />
+    <ClCompile Include="..\..\src\main_runtime.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\cudaMat4.h" />
@@ -30,6 +32,8 @@
     <ClInclude Include="..\..\src\stb_image\stb_image.h">
       <Filter>stb_image</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\src\main_initializer.h" />
+    <ClInclude Include="..\..\src\main_runtime.h" />
   </ItemGroup>
   <ItemGroup>
     <CudaCompile Include="..\..\src\raytraceKernel.cu" />
diff --git a/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.user b/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.user
index dfd9f6c..6f97d04 100755
--- a/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.user
+++ b/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.user
@@ -16,4 +16,9 @@
     <LocalDebuggerCommandArguments>scene=../../scenes/sampleScene.txt</LocalDebuggerCommandArguments>
     <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
   </PropertyGroup>
-</Project>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LocalDebuggerCommandArguments>scene=../../scenes/color_bleeding.txt frame=0</LocalDebuggerCommandArguments>
+    <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
+    <ShowAllFiles>false</ShowAllFiles>
+  </PropertyGroup>
+</Project>
\ No newline at end of file
diff --git a/README.md b/README.md
index 324bd3e..45c5005 100755
--- a/README.md
+++ b/README.md
@@ -1,148 +1,85 @@
--------------------------------------------------------------------------------
-CIS565: Project 2: CUDA Pathtracer
--------------------------------------------------------------------------------
-Fall 2013
--------------------------------------------------------------------------------
-Due Wednesday, 10/02/13
--------------------------------------------------------------------------------
 
--------------------------------------------------------------------------------
-NOTE:
--------------------------------------------------------------------------------
-This project requires an NVIDIA graphics card with CUDA capability! Any card after the Geforce 8xxx series will work. If you do not have an NVIDIA graphics card in the machine you are working on, feel free to use any machine in the SIG Lab or in Moore100 labs. All machines in the SIG Lab and Moore100 are equipped with CUDA capable NVIDIA graphics cards. If this too proves to be a problem, please contact Patrick or Liam as soon as possible.
+CUDA Pathtracer (Fall 2013 CIS565 Project 2)
+============================================
 
--------------------------------------------------------------------------------
-INTRODUCTION:
--------------------------------------------------------------------------------
-In this project, you will extend your raytracer from Project 1 into a full CUDA based global illumination pathtracer. 
+(Due Wednesday, 10/02/13)  
+(Note: code and rendering in progress)
 
-For this project, you may either choose to continue working off of your codebase from Project 1, or you may choose to use the included basecode in this repository. The basecode for Project 2 is the same as the basecode for Project 1, but with some missing components you will need filled in, such as the intersection testing and camera raycasting methods. 
+Full global illumination (note soft shadows and color bleeding)
+![Diffuse](https://raw.github.com/takfuruya/Project2-Pathtracer/master/renders/color_bleeding.0.8.bmp)
 
-How you choose to extend your raytracer into a pathtracer is a fairly open-ended problem; the supplied basecode is meant to serve as one possible set of guidelines for doing so, but you may choose any approach you want in your actual implementation, including completely scrapping the provided basecode in favor of your own from-scratch solution.
+Perfect specular reflection
+![Reflective](https://raw.github.com/takfuruya/Project2-Pathtracer/master/renders/color_bleeding.0.10.bmp)
+
+Supersampled antialiasing
+![Anti-aliasing](https://raw.github.com/takfuruya/Project2-Pathtracer/master/anti-aliasing.png)
 
 -------------------------------------------------------------------------------
-CONTENTS:
+Performance Analysis
 -------------------------------------------------------------------------------
-The Project2 root directory contains the following subdirectories:
-	
-* src/ contains the source code for the project. Both the Windows Visual Studio solution and the OSX makefile reference this folder for all source; the base source code compiles on OSX and Windows without modification.
-* scenes/ contains an example scene description file.
-* renders/ contains two example renders: the raytraced render from Project 1 (GI_no.bmp), and the same scene rendered with global illumination (GI_yes.bmp). 
-* PROJ1_WIN/ contains a Windows Visual Studio 2010 project and all dependencies needed for building and running on Windows 7.
-* PROJ1_OSX/ contains a OSX makefile, run script, and all dependencies needed for building and running on Mac OSX 10.8. 
-* PROJ1_NIX/ contains a Linux makefile for building and running on Ubuntu 
-  12.04 LTS. Note that you will need to set the following environment
-  variables: 
-    
-  - PATH=$PATH:/usr/local/cuda-5.5/bin
-  - LD_LIBRARY_PATH=/usr/local/cuda-5.5/lib64:/lib
-
-The projects build and run exactly the same way as in Project0 and Project1.
+Program was parallelized by ray (instead of by pixel) via stream compaction.  
+
+***In progress.***
 
 -------------------------------------------------------------------------------
-REQUIREMENTS:
+Nsight Insights
 -------------------------------------------------------------------------------
-In this project, you are given code for:
 
-* All of the basecode from Project 1, plus:
-* Intersection testing code for spheres and cubes
-* Code for raycasting from the camera
+My program was running fine when "Generate GPU Debug Information" was set to "Yes" but immediately
+stopped execution when it was set to "No". No changes in code. What is happening?
 
-You will need to implement the following features. A number of these required features you may have already implemented in Project 1. If you have, you are ahead of the curve and have less work to do! 
+![Generate GPU Debug Information](https://raw.github.com/takfuruya/Project2-Pathtracer/master/1.png)
 
-* Full global illumination (including soft shadows, color bleeding, etc.) by pathtracing rays through the scene. 
-* Properly accumulating emittance and colors to generate a final image
-* Supersampled antialiasing
-* Parallelization by ray instead of by pixel via stream compaction (you may use Thrust for this).
-* Perfect specular reflection
+I identified the issue on the host by running the following code after each kernel launch:
 
-You are also required to implement at least two of the following features. Some of these features you may have already implemented in Project 1. If you have, you may NOT resubmit those features and instead must pick two new ones to implement.
+	cudaError_t err = cudaGetLastError();
+	if( cudaSuccess != err) {
+		fprintf(stderr, "Cuda error: %s.\n", cudaGetErrorString(err)); 
+		exit(EXIT_FAILURE);
+	}
 
-* From scratch stream compaction (no Thrust).
-* Additional BRDF models, such as Cook-Torrance, Ward, etc. Each BRDF model may count as a separate feature. 
-* Texture mapping 
-* Bump mapping
-* Translational motion blur
-* Fresnel-based Refraction, i.e. glass
-* OBJ Mesh loading and rendering without KD-Tree
-* Interactive camera
-* Integrate an existing stackless KD-Tree library, such as CUKD (https://github.com/unvirtual/cukd)
-* Depth of field
+I found that my path tracer kernel was not launching due to "too many resources requested for launch".
+First, I checked the number of threads launched:
 
-Alternatively, implementing just one of the following features can satisfy the "pick two" feature requirement, since these are correspondingly more difficult problems:
+	int num_threads_per_block = 512;
+	int num_blocks_per_grid = ceil((float)num_rays / num_threads_per_block);
+	TraceRay<<<num_blocks_per_grid, num_threads_per_block>>>(...
 
-* Physically based subsurface scattering and transmission
-* Implement and integrate your own stackless KD-Tree from scratch. 
-* Displacement mapping
-* Deformational motion blur
+I was working on a device with compute capability 1.2 (GeForce 310M) whose specs are:
 
-As yet another alternative, if you have a feature or features you really want to implement that are not on this list, let us know, and we'll probably say yes!
+ - max 512 threads/block
+ - max 65535 blocks/grid
+ - 2 SM
+ - 8 blocks/SM
 
--------------------------------------------------------------------------------
-NOTES ON GLM:
--------------------------------------------------------------------------------
-This project uses GLM, the GL Math library, for linear algebra. You need to know two important points on how GLM is used in this project:
+I am using ```ceil(800*800 / 512) = 1250``` blocks per grid so I am within limits.
+Then I checked the amount of memory used in this kernel by adding "ptxas" flag to nvcc.
 
-* In this project, indices in GLM vectors (such as vec3, vec4), are accessed via swizzling. So, instead of v[0], v.x is used, and instead of v[1], v.y is used, and so on and so forth.
-* GLM Matrix operations work fine on NVIDIA Fermi cards and later, but pre-Fermi cards do not play nice with GLM matrices. As such, in this project, GLM matrices are replaced with a custom matrix struct, called a cudaMat4, found in cudaMat4.h. A custom function for multiplying glm::vec4s and cudaMat4s is provided as multiplyMV() in intersections.h.
+![ptxas Flag](https://raw.github.com/takfuruya/Project2-Pathtracer/master/2.png)
 
--------------------------------------------------------------------------------
-README
--------------------------------------------------------------------------------
-All students must replace or augment the contents of this Readme.md in a clear 
-manner with the following:
+Building it, I was able to confirm that the ```TraceRay``` kernel was using 63 registers per thread.
 
-* A brief description of the project and the specific features you implemented.
-* At least one screenshot of your project running.
-* A 30 second or longer video of your project running.  To create the video you
-  can use http://www.microsoft.com/expression/products/Encoder4_Overview.aspx 
-* A performance evaluation (described in detail below).
+![63 Registers](https://raw.github.com/takfuruya/Project2-Pathtracer/master/3.png)
 
--------------------------------------------------------------------------------
-PERFORMANCE EVALUATION
--------------------------------------------------------------------------------
-The performance evaluation is where you will investigate how to make your CUDA
-programs more efficient using the skills you've learned in class. You must have
-performed at least one experiment on your code to investigate the positive or
-negative effects on performance. 
+	(63 registers/thread) * (512 threads/block) = 32256 registers/block
 
-One such experiment would be to investigate the performance increase involved 
-with adding a spatial data-structure to your scene data.
+My device has a limit of 16384 registers per block, so I guessed this was the issue:  
+(Nsight > Windows > System Info)
 
-Another idea could be looking at the change in timing between various block
-sizes.
+![Max Registers Per Block](https://raw.github.com/takfuruya/Project2-Pathtracer/master/4.png)
 
-A good metric to track would be number of rays per second, or frames per 
-second, or number of objects displayable at 60fps.
+I changed ```num_threads_per_block``` to 128 such that,
 
-We encourage you to get creative with your tweaks. Consider places in your code
-that could be considered bottlenecks and try to improve them. 
+	(63 registers/thread) * (128 threads/block) = 8064 registers/block
 
-Each student should provide no more than a one page summary of their
-optimizations along with tables and or graphs to visually explain any
-performance differences.
+...and it worked.  
+It seems that setting "Generate GPU Debug Information" to "Yes" changes how many registers the
+kernel uses per thread (and therefore per block); the device's per-block register limit itself is fixed.
 
--------------------------------------------------------------------------------
-THIRD PARTY CODE POLICY
--------------------------------------------------------------------------------
-* Use of any third-party code must be approved by asking on the Google group.  If it is approved, all students are welcome to use it.  Generally, we approve use of third-party code that is not a core part of the project.  For example, for the ray tracer, we would approve using a third-party library for loading models, but would not approve copying and pasting a CUDA function for doing refraction.
-* Third-party code must be credited in README.md.
-* Using third-party code without its approval, including using another student's code, is an academic integrity violation, and will result in you receiving an F for the semester.
 
 -------------------------------------------------------------------------------
-SELF-GRADING
+Extra
 -------------------------------------------------------------------------------
-* On the submission date, email your grade, on a scale of 0 to 100, to Liam, liamboone+cis565@gmail.com, with a one paragraph explanation.  Be concise and realistic.  Recall that we reserve 30 points as a sanity check to adjust your grade.  Your actual grade will be (0.7 * your grade) + (0.3 * our grade).  We hope to only use this in extreme cases when your grade does not realistically reflect your work - it is either too high or too low.  In most cases, we plan to give you the exact grade you suggest.
-* Projects are not weighted evenly, e.g., Project 0 doesn't count as much as the path tracer.  We will determine the weighting at the end of the semester based on the size of each project.
 
--------------------------------------------------------------------------------
-SUBMISSION
--------------------------------------------------------------------------------
-As with the previous project, you should fork this project and work inside of your fork. Upon completion, commit your finished project back to your fork, and make a pull request to the master repository.
-You should include a README.md file in the root directory detailing the following
-
-* A brief description of the project and specific features you implemented
-* At least one screenshot of your project running, and at least one screenshot of the final rendered output of your pathtracer
-* Instructions for building and running your project if they differ from the base code
-* A link to your blog post detailing the project
-* A list of all third-party code used
+A trippy image was produced when the random number generator was not seeded properly.
+![Trippy Image](https://raw.github.com/takfuruya/Project2-Pathtracer/master/trippy.png)
diff --git a/anti-aliasing.png b/anti-aliasing.png
new file mode 100644
index 0000000..f1c85a8
Binary files /dev/null and b/anti-aliasing.png differ
diff --git a/renders/Thumbs.db b/renders/Thumbs.db
new file mode 100644
index 0000000..8328d82
Binary files /dev/null and b/renders/Thumbs.db differ
diff --git a/renders/color_bleeding.0.1.bmp b/renders/color_bleeding.0.1.bmp
new file mode 100644
index 0000000..ec68faa
Binary files /dev/null and b/renders/color_bleeding.0.1.bmp differ
diff --git a/renders/color_bleeding.0.10.bmp b/renders/color_bleeding.0.10.bmp
new file mode 100644
index 0000000..448b374
Binary files /dev/null and b/renders/color_bleeding.0.10.bmp differ
diff --git a/renders/color_bleeding.0.2.bmp b/renders/color_bleeding.0.2.bmp
new file mode 100644
index 0000000..53b00ab
Binary files /dev/null and b/renders/color_bleeding.0.2.bmp differ
diff --git a/renders/color_bleeding.0.3.bmp b/renders/color_bleeding.0.3.bmp
new file mode 100644
index 0000000..2c4825a
Binary files /dev/null and b/renders/color_bleeding.0.3.bmp differ
diff --git a/renders/color_bleeding.0.4.bmp b/renders/color_bleeding.0.4.bmp
new file mode 100644
index 0000000..8dee479
Binary files /dev/null and b/renders/color_bleeding.0.4.bmp differ
diff --git a/renders/test.0.bmp b/renders/color_bleeding.0.5.bmp
similarity index 53%
rename from renders/test.0.bmp
rename to renders/color_bleeding.0.5.bmp
index 9319138..efcc721 100644
Binary files a/renders/test.0.bmp and b/renders/color_bleeding.0.5.bmp differ
diff --git a/renders/color_bleeding.0.6.bmp b/renders/color_bleeding.0.6.bmp
new file mode 100644
index 0000000..2620943
Binary files /dev/null and b/renders/color_bleeding.0.6.bmp differ
diff --git a/renders/color_bleeding.0.7.bmp b/renders/color_bleeding.0.7.bmp
new file mode 100644
index 0000000..6ca9f50
Binary files /dev/null and b/renders/color_bleeding.0.7.bmp differ
diff --git a/renders/color_bleeding.0.8.bmp b/renders/color_bleeding.0.8.bmp
new file mode 100644
index 0000000..aabc721
Binary files /dev/null and b/renders/color_bleeding.0.8.bmp differ
diff --git a/renders/color_bleeding.0.9.bmp b/renders/color_bleeding.0.9.bmp
new file mode 100644
index 0000000..02f46df
Binary files /dev/null and b/renders/color_bleeding.0.9.bmp differ
diff --git a/scenes/color_bleeding.txt b/scenes/color_bleeding.txt
new file mode 100644
index 0000000..44ba3ef
--- /dev/null
+++ b/scenes/color_bleeding.txt
@@ -0,0 +1,89 @@
+MATERIAL 0				//white diffuse
+RGB         1 1 1
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 1				//green diffuse
+RGB         0 1 0
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 2				//red diffuse
+RGB         1 0 0
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 3				//white light
+RGB         1 1 1
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   15
+
+CAMERA
+RES         800 800
+FOVY        53.1301024
+ITERATIONS  1000
+FILE        color_bleeding.bmp
+frame       0
+EYE         0 0.5 1.5
+VIEW        0 0 -1
+UP          0 1 0
+
+OBJECT      0				//white sphere
+sphere
+material    0
+frame       0
+TRANS       0 0.2 0
+ROTAT       0 0 0
+SCALE       0.4 0.4 0.4
+
+OBJECT      1				//green wall
+cube
+material    1
+frame       0
+TRANS       1 0.5 0
+ROTAT       0 0 0
+SCALE       1 1 1
+
+OBJECT      2				//red floor
+cube
+material    2
+frame       0
+TRANS       0 -0.5 0
+ROTAT       0 0 0
+SCALE       1 1 1
+
+OBJECT      3				//white light
+sphere
+material    3
+frame       0
+TRANS       0.25 1 0
+ROTAT       0 0 0
+SCALE       0.1 0.1 0.1
diff --git a/scenes/color_bleeding2.txt b/scenes/color_bleeding2.txt
new file mode 100644
index 0000000..2582e2f
--- /dev/null
+++ b/scenes/color_bleeding2.txt
@@ -0,0 +1,89 @@
+MATERIAL 0				//white diffuse
+RGB         1 1 1
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 1				//green diffuse
+RGB         0 1 0
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 2				//red diffuse
+RGB         1 0 0
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 3				//white light
+RGB         1 1 1
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   1
+
+CAMERA
+RES         800 800
+FOVY        53.1301024
+ITERATIONS  100
+FILE        color_bleeding.bmp
+frame       0
+EYE         0 0.5 1.5
+VIEW        0 0 -1
+UP          0 1 0
+
+OBJECT      0				//white sphere
+sphere
+material    0
+frame       0
+TRANS       0 0.2 0
+ROTAT       0 0 0
+SCALE       0.4 0.4 0.4
+
+OBJECT      1				//green wall
+cube
+material    1
+frame       0
+TRANS       1 0.5 0
+ROTAT       0 0 0
+SCALE       1 1 1
+
+OBJECT      2				//red floor
+cube
+material    2
+frame       0
+TRANS       0 -0.5 0
+ROTAT       0 0 0
+SCALE       1 1 1
+
+OBJECT      3				//white light
+cube
+material    3
+frame       0
+TRANS       0.25 1 0
+ROTAT       0 0 0
+SCALE       0.1 0.1 0.1
\ No newline at end of file
diff --git a/scenes/cornell_box.txt b/scenes/cornell_box.txt
new file mode 100644
index 0000000..6e6b546
--- /dev/null
+++ b/scenes/cornell_box.txt
@@ -0,0 +1,137 @@
+MATERIAL 0				//white diffuse
+RGB         1 1 1
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 1				//red diffuse
+RGB         1 0 0
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 2				//green diffuse
+RGB         0 1 0
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 3				//white light
+RGB         1 1 1
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   15
+
+MATERIAL 4				//mirror
+RGB         1 1 1
+SPECEX      0
+SPECRGB     1 1 1
+REFL        1
+REFR        0
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 5				//glass
+RGB         1 1 1
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        1
+REFRIOR     0
+SCATTER     0
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+CAMERA
+RES         800 800
+FOVY        53.1301024
+ITERATIONS  200
+FILE        color_bleeding.bmp
+frame       0
+EYE         0 0.5 1.5
+VIEW        0 0 -1
+UP          0 1 0
+
+OBJECT      0				//floor
+cube
+material    0
+frame       0
+TRANS       0 -0.5 0
+ROTAT       0 0 0
+SCALE       1 1 1
+
+OBJECT      1				//left wall
+cube
+material    1
+frame       0
+TRANS       -1 0.5 0
+ROTAT       0 0 0
+SCALE       1 1 1
+
+OBJECT      2				//right wall
+cube
+material    2
+frame       0
+TRANS       1 0.5 0
+ROTAT       0 0 0
+SCALE       1 1 1
+
+OBJECT      3				//back wall
+cube
+material    0
+frame       0
+TRANS       0 0.5 -1
+ROTAT       0 0 0
+SCALE       1 1 1
+
+OBJECT      4				//ceiling
+cube
+material    0
+frame       0
+TRANS       0 1.5 0
+ROTAT       0 0 0
+SCALE       1 1 1
+
+OBJECT      5				//top light
+cube
+material    3
+frame       0
+TRANS       0 1.49 0
+ROTAT       0 0 0
+SCALE       0.2 1 0.2
+
+OBJECT      6				//sphere
+sphere
+material    4
+frame       0
+TRANS       0 0.2 0
+ROTAT       0 0 0
+SCALE       0.4 0.4 0.4
diff --git a/scenes/cube.txt b/scenes/cube.txt
new file mode 100644
index 0000000..bfe4b22
--- /dev/null
+++ b/scenes/cube.txt
@@ -0,0 +1,29 @@
+MATERIAL 0				//white diffuse
+RGB         1 1 1       
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+CAMERA
+RES         800 800
+FOVY        53.1301024
+ITERATIONS  100
+FILE        test.bmp
+frame       0
+EYE         0.5 0.5 1
+VIEW        0 0 -1
+UP          0 1 0
+
+OBJECT      0
+cube
+material    0 
+frame       0
+TRANS       0.5 0.5 0
+ROTAT       45 45 0
+SCALE       0.2 0.2 0.2
diff --git a/src/interactions.h b/src/interactions.h
index 6561796..1b503b3 100755
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -40,6 +40,27 @@ __host__ __device__ glm::vec3 calculateRandomDirectionInHemisphere(glm::vec3 nor
     
 }
 
+
+// Picks a random unit direction on the same side as n (sign-flipped when it points away from n).
+__host__ __device__ glm::vec3 UniformRandomHemisphereDirection(glm::vec3& n, float random_seed)
+{
+	// Sphere point picking: http://mathworld.wolfram.com/SpherePointPicking.html
+	thrust::default_random_engine generator(hash(random_seed));
+	thrust::uniform_real_distribution<float> unit(0.0f, 1.0f);
+	// Draw order is fixed: azimuth first, then height.
+	float azimuth = unit(generator) * 2 * PI;	// 0 to 2pi
+	float height = 2 * unit(generator) - 1;		// -1 to 1
+
+	// Radius of the circle cut from the unit sphere at this height.
+	float ring = sqrt(1 - height * height);
+	glm::vec3 dir(ring * cos(azimuth), ring * sin(azimuth), height);
+
+	// Keep the sample if it already lies on n's side, otherwise mirror it through the origin.
+	if (glm::dot(dir, n) > 0.0f)
+		return dir;
+	return -dir;
+}
+
 //TODO: IMPLEMENT THIS FUNCTION
 //Now that you know how cosine weighted direction generation works, try implementing non-cosine (uniform) weighted random direction generation.
 //This should be much easier than if you had to implement calculateRandomDirectionInHemisphere.
diff --git a/src/intersections.h b/src/intersections.h
index a6b9469..0562415 100755
--- a/src/intersections.h
+++ b/src/intersections.h
@@ -17,10 +17,13 @@ __host__ __device__ glm::vec3 getPointOnRay(ray r, float t);
 __host__ __device__ glm::vec3 multiplyMV(cudaMat4 m, glm::vec4 v);
 __host__ __device__ glm::vec3 getSignOfRay(ray r);
 __host__ __device__ glm::vec3 getInverseDirectionOfRay(ray r);
+__host__ __device__ float geomIntersectionTest(staticGeom s_geom, ray r, glm::vec3& intersectionPoint, glm::vec3& normal);
 __host__ __device__ float boxIntersectionTest(staticGeom sphere, ray r, glm::vec3& intersectionPoint, glm::vec3& normal);
 __host__ __device__ float boxIntersectionTest(glm::vec3 boxMin, glm::vec3 boxMax, staticGeom box, ray r, glm::vec3& intersectionPoint, glm::vec3& normal);
 __host__ __device__ float sphereIntersectionTest(staticGeom sphere, ray r, glm::vec3& intersectionPoint, glm::vec3& normal);
+__host__ __device__ glm::vec3 getRandomPointOnGeom(staticGeom& s_geom, float randomSeed);
 __host__ __device__ glm::vec3 getRandomPointOnCube(staticGeom cube, float randomSeed);
+__host__ __device__ glm::vec3 getRandomPointOnSphere(staticGeom sphere, float randomSeed);
 
 //Handy dandy little hashing function that provides seeds for random number generation
 __host__ __device__ unsigned int hash(unsigned int a){
@@ -69,6 +72,21 @@ __host__ __device__ glm::vec3 getSignOfRay(ray r){
   return glm::vec3((int)(inv_direction.x < 0), (int)(inv_direction.y < 0), (int)(inv_direction.z < 0));
 }
 
+// Forwards the ray test to the sphere or cube routine based on s_geom.type; returns -1 otherwise.
+__host__ __device__ float geomIntersectionTest(staticGeom s_geom, ray r, glm::vec3& intersectionPoint, glm::vec3& normal)
+{
+	if (s_geom.type == SPHERE)
+	{
+		return sphereIntersectionTest(s_geom, r, intersectionPoint, normal);
+	}
+	if (s_geom.type == CUBE)
+	{
+		return boxIntersectionTest(s_geom, r, intersectionPoint, normal);
+	}
+	// MESH and any unrecognised type have no intersection routine here.
+	return -1.0f;
+}
+
 //Wrapper for cube intersection test for testing against unit cubes
 __host__ __device__  float boxIntersectionTest(staticGeom box, ray r, glm::vec3& intersectionPoint, glm::vec3& normal){
   return boxIntersectionTest(glm::vec3(-.5,-.5,-.5), glm::vec3(.5,.5,.5), box, r, intersectionPoint, normal);
@@ -215,6 +233,21 @@ __host__ __device__ glm::vec3 getRadiuses(staticGeom geom){
     return glm::vec3(xradius, yradius, zradius);
 }
 
+// Forwards to the sphere or cube point sampler according to s_geom.type; other types yield the origin.
+// Qualified __host__ __device__ so the definition agrees with the forward declaration in this header.
+__host__ __device__ glm::vec3 getRandomPointOnGeom(staticGeom& s_geom, float randomSeed)
+{
+	switch ( s_geom.type )
+	{
+		case SPHERE:
+			return getRandomPointOnSphere(s_geom, randomSeed);
+		case CUBE:
+			return getRandomPointOnCube(s_geom, randomSeed);
+		default:	// MESH has no sampler here, so it shares the fallback
+			return glm::vec3(0.0f);
+	}
+}
+
 //LOOK: Example for generating a random point on an object using thrust.
 //Generates a random point on a given cube
 __host__ __device__ glm::vec3 getRandomPointOnCube(staticGeom cube, float randomSeed){
diff --git a/src/main.cpp b/src/main.cpp
index 81836b1..960d308 100755
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,318 +1,110 @@
 // CIS565 CUDA Raytracer: A parallel raytracer for Patrick Cozzi's CIS565: GPU Computing at the University of Pennsylvania
-// Written by Yining Karl Li, Copyright (c) 2012 University of Pennsylvania
+// Written by Takashi Furuya
 // This file includes code from:
 //       Rob Farber for CUDA-GL interop, from CUDA Supercomputing For The Masses: http://www.drdobbs.com/architecture-and-design/cuda-supercomputing-for-the-masses-part/222600097
 //       Varun Sampath and Patrick Cozzi for GLSL Loading, from CIS565 Spring 2012 HW5 at the University of Pennsylvania: http://cis565-spring-2012.github.com/
 //       Yining Karl Li's TAKUA Render, a massively parallel pathtracing renderer: http://www.yiningkarlli.com
 
 #include "main.h"
+#include "main_initializer.h"
+#include "main_runtime.h"
+#include <GL/glut.h>
 
-//-------------------------------
-//-------------MAIN--------------
-//-------------------------------
-
-int main(int argc, char** argv){
-
-  #ifdef __APPLE__
-	  // Needed in OSX to force use of OpenGL3.2 
-	  glfwOpenWindowHint(GLFW_OPENGL_VERSION_MAJOR, 3);
-	  glfwOpenWindowHint(GLFW_OPENGL_VERSION_MINOR, 2);
-	  glfwOpenWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
-	  glfwOpenWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
-  #endif
-
-  // Set up pathtracer stuff
-  bool loadedScene = false;
-  finishedRender = false;
-
-  targetFrame = 0;
-  singleFrameMode = false;
-
-  // Load scene file
-  for(int i=1; i<argc; i++){
-    string header; string data;
-    istringstream liness(argv[i]);
-    getline(liness, header, '='); getline(liness, data, '=');
-    if(strcmp(header.c_str(), "scene")==0){
-      renderScene = new scene(data);
-      loadedScene = true;
-    }else if(strcmp(header.c_str(), "frame")==0){
-      targetFrame = atoi(data.c_str());
-      singleFrameMode = true;
-    }
-  }
-
-  if(!loadedScene){
-    cout << "Error: scene file needed!" << endl;
-    return 0;
-  }
-
-  // Set up camera stuff from loaded pathtracer settings
-  iterations = 0;
-  renderCam = &renderScene->renderCam;
-  width = renderCam->resolution[0];
-  height = renderCam->resolution[1];
-
-  if(targetFrame>=renderCam->frames){
-    cout << "Warning: Specified target frame is out of range, defaulting to frame 0." << endl;
-    targetFrame = 0;
-  }
+using namespace std;
 
-  // Launch CUDA/GL
+void initVAO();
+GLuint initShader(const char *vertexShaderPath, const char *fragmentShaderPath);
 
-  #ifdef __APPLE__
-	init();
-  #else
-	init(argc, argv);
-  #endif
 
-  initCuda();
+int main(int argc, char** argv)
+{
+	// Set up pathtracer.
+	bool is_scene_loaded	= false;
+	is_render_done			= false;
+	target_frame			= 0;
+	is_single_frame_mode	= false;
+	iterations				= 0;
 
-  initVAO();
-  initTextures();
-
-  GLuint passthroughProgram;
-  passthroughProgram = initShader("shaders/passthroughVS.glsl", "shaders/passthroughFS.glsl");
-
-  glUseProgram(passthroughProgram);
-  glActiveTexture(GL_TEXTURE0);
+	// Read command line arguments and load scene file.
+	for ( int i=1; i<argc; ++i )
+	{
+		// header=data (e.g. scene=my_scene.txt)
+		string header, data;
+		istringstream liness(argv[i]);
+		getline(liness, header, '=');
+		getline(liness, data, '=');
 
-  #ifdef __APPLE__
-	  // send into GLFW main loop
-	  while(1){
-		display();
-		if (glfwGetKey(GLFW_KEY_ESC) == GLFW_PRESS || !glfwGetWindowParam( GLFW_OPENED )){
-				exit(0);
+		if ( strcmp(header.c_str(), "scene") == 0 )
+		{
+			render_scene = new scene(data);
+			is_scene_loaded = true;
+		}
+		else if ( strcmp(header.c_str(), "frame") == 0 )
+		{
+			target_frame = atoi(data.c_str());
+			is_single_frame_mode = true;
 		}
-	  }
-
-	  glfwTerminate();
-  #else
-	  glutDisplayFunc(display);
-	  glutKeyboardFunc(keyboard);
-
-	  glutMainLoop();
-  #endif
-  return 0;
-}
-
-//-------------------------------
-//---------RUNTIME STUFF---------
-//-------------------------------
-
-void runCuda(){
-
-  // Map OpenGL buffer object for writing from CUDA on a single GPU
-  // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
-  
-  if(iterations<renderCam->iterations){
-    uchar4 *dptr=NULL;
-    iterations++;
-    cudaGLMapBufferObject((void**)&dptr, pbo);
-  
-    //pack geom and material arrays
-    geom* geoms = new geom[renderScene->objects.size()];
-    material* materials = new material[renderScene->materials.size()];
-    
-    for(int i=0; i<renderScene->objects.size(); i++){
-      geoms[i] = renderScene->objects[i];
-    }
-    for(int i=0; i<renderScene->materials.size(); i++){
-      materials[i] = renderScene->materials[i];
-    }
-    
-  
-    // execute the kernel
-    cudaRaytraceCore(dptr, renderCam, targetFrame, iterations, materials, renderScene->materials.size(), geoms, renderScene->objects.size() );
-    
-    // unmap buffer object
-    cudaGLUnmapBufferObject(pbo);
-  }else{
-
-    if(!finishedRender){
-      //output image file
-      image outputImage(renderCam->resolution.x, renderCam->resolution.y);
-
-      for(int x=0; x<renderCam->resolution.x; x++){
-        for(int y=0; y<renderCam->resolution.y; y++){
-          int index = x + (y * renderCam->resolution.x);
-          outputImage.writePixelRGB(renderCam->resolution.x-1-x,y,renderCam->image[index]);
-        }
-      }
-      
-      gammaSettings gamma;
-      gamma.applyGamma = true;
-      gamma.gamma = 1.0;
-      gamma.divisor = 1.0; //renderCam->iterations;
-      outputImage.setGammaSettings(gamma);
-      string filename = renderCam->imageName;
-      string s;
-      stringstream out;
-      out << targetFrame;
-      s = out.str();
-      utilityCore::replaceString(filename, ".bmp", "."+s+".bmp");
-      utilityCore::replaceString(filename, ".png", "."+s+".png");
-      outputImage.saveImageRGB(filename);
-      cout << "Saved frame " << s << " to " << filename << endl;
-      finishedRender = true;
-      if(singleFrameMode==true){
-        cudaDeviceReset(); 
-        exit(0);
-      }
-    }
-    if(targetFrame<renderCam->frames-1){
-
-      //clear image buffer and move onto next frame
-      targetFrame++;
-      iterations = 0;
-      for(int i=0; i<renderCam->resolution.x*renderCam->resolution.y; i++){
-        renderCam->image[i] = glm::vec3(0,0,0);
-      }
-      cudaDeviceReset(); 
-      finishedRender = false;
-    }
-  }
-  
-}
-
-#ifdef __APPLE__
-
-	void display(){
-		runCuda();
-
-		string title = "CIS565 Render | " + utilityCore::convertIntToString(iterations) + " Iterations";
-		glfwSetWindowTitle(title.c_str());
-
-		glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo);
-		glBindTexture(GL_TEXTURE_2D, displayImage);
-		glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, 
-			  GL_RGBA, GL_UNSIGNED_BYTE, NULL);
-
-		glClear(GL_COLOR_BUFFER_BIT);   
-
-		// VAO, shader program, and texture already bound
-		glDrawElements(GL_TRIANGLES, 6,  GL_UNSIGNED_SHORT, 0);
-
-		glfwSwapBuffers();
-	}
-
-#else
-
-	void display(){
-		runCuda();
-
-		string title = "565Raytracer | " + utilityCore::convertIntToString(iterations) + " Iterations";
-		glutSetWindowTitle(title.c_str());
-
-		glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo);
-		glBindTexture(GL_TEXTURE_2D, displayImage);
-		glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, 
-			  GL_RGBA, GL_UNSIGNED_BYTE, NULL);
-
-		glClear(GL_COLOR_BUFFER_BIT);   
-
-		// VAO, shader program, and texture already bound
-		glDrawElements(GL_TRIANGLES, 6,  GL_UNSIGNED_SHORT, 0);
-
-		glutPostRedisplay();
-		glutSwapBuffers();
 	}
 
-	void keyboard(unsigned char key, int x, int y)
+	if ( !is_scene_loaded )
 	{
-		std::cout << key << std::endl;
-		switch (key) 
-		{
-		   case(27):
-			   exit(1);
-			   break;
-		}
+		cout << "Error: scene file needed!" << endl;
+		return 0;
 	}
 
-#endif
+	camera& cam		= render_scene->renderCam;
+	int width		= cam.resolution.x;
+	int height		= cam.resolution.y;
 
+	if ( target_frame >= cam.frames )
+	{
+		cout << "Warning: Specified target frame is out of range, ";
+		cout << "defaulting to frame 0." << endl;
+		target_frame = 0;
+	}
 
+	// Initialize GLUT - Create OpenGL rendering context.
+	glutInit(&argc, argv);
+	glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
+	glutInitWindowSize(width, height);
+	glutCreateWindow("565Raytracer");
 
+	// Initialize GLEW
+	GLenum err = glewInit();
+	if ( GLEW_OK != err )
+	{
+		cout << "glewInit failed, aborting." << endl;
+		exit(1);
+	}
 
-//-------------------------------
-//----------SETUP STUFF----------
-//-------------------------------
+	// Register GL buffer with CUDA
+	initCuda(&pbo, &displayImage, width, height);
+	
+	//runCuda();
 
-#ifdef __APPLE__
-	void init(){
+	initVAO();
+	
+	GLuint passthroughProgram;
+	passthroughProgram = initShader("shaders/passthroughVS.glsl", "shaders/passthroughFS.glsl");
 
-		if (glfwInit() != GL_TRUE){
-			shut_down(1);      
-		}
+	glUseProgram(passthroughProgram);
+	glActiveTexture(GL_TEXTURE0);
 
-		// 16 bit color, no depth, alpha or stencil buffers, windowed
-		if (glfwOpenWindow(width, height, 5, 6, 5, 0, 0, 0, GLFW_WINDOW) != GL_TRUE){
-			shut_down(1);
-		}
+	// Register window callbacks.
+	glutDisplayFunc(display);
+	glutKeyboardFunc(keyboard);
+	glutSpecialFunc(SpecialInput);
 
-		// Set up vertex array object, texture stuff
-		initVAO();
-		initTextures();
-	}
-#else
-	void init(int argc, char* argv[]){
-		glutInit(&argc, argv);
-		glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
-		glutInitWindowSize(width, height);
-		glutCreateWindow("565Raytracer");
-
-		// Init GLEW
-		glewInit();
-		GLenum err = glewInit();
-		if (GLEW_OK != err)
-		{
-			/* Problem: glewInit failed, something is seriously wrong. */
-			std::cout << "glewInit failed, aborting." << std::endl;
-			exit (1);
-		}
+	// Start main rendering loop.
+	glutMainLoop();
 
-		initVAO();
-		initTextures();
-	}
-#endif
-
-void initPBO(GLuint* pbo){
-  if (pbo) {
-    // set up vertex data parameter
-    int num_texels = width*height;
-    int num_values = num_texels * 4;
-    int size_tex_data = sizeof(GLubyte) * num_values;
-    
-    // Generate a buffer ID called a PBO (Pixel Buffer Object)
-    glGenBuffers(1,pbo);
-    // Make this the current UNPACK buffer (OpenGL is state-based)
-    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);
-    // Allocate data for the buffer. 4-channel 8-bit image
-    glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);
-    cudaGLRegisterBufferObject( *pbo );
-  }
+	return 0;
 }
 
-void initCuda(){
-  // Use device with highest Gflops/s
-  cudaGLSetGLDevice( compat_getMaxGflopsDeviceId() );
 
-  initPBO(&pbo);
 
-  // Clean up on program exit
-  atexit(cleanupCuda);
 
-  runCuda();
-}
 
-void initTextures(){
-    glGenTextures(1,&displayImage);
-    glBindTexture(GL_TEXTURE_2D, displayImage);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_BGRA,
-        GL_UNSIGNED_BYTE, NULL);
-}
 
 void initVAO(void){
     GLfloat vertices[] =
@@ -362,37 +154,4 @@ GLuint initShader(const char *vertexShaderPath, const char *fragmentShaderPath){
     }
 
     return program;
-}
-
-//-------------------------------
-//---------CLEANUP STUFF---------
-//-------------------------------
-
-void cleanupCuda(){
-  if(pbo) deletePBO(&pbo);
-  if(displayImage) deleteTexture(&displayImage);
-}
-
-void deletePBO(GLuint* pbo){
-  if (pbo) {
-    // unregister this buffer object with CUDA
-    cudaGLUnregisterBufferObject(*pbo);
-    
-    glBindBuffer(GL_ARRAY_BUFFER, *pbo);
-    glDeleteBuffers(1, pbo);
-    
-    *pbo = (GLuint)NULL;
-  }
-}
-
-void deleteTexture(GLuint* tex){
-    glDeleteTextures(1, tex);
-    *tex = (GLuint)NULL;
-}
- 
-void shut_down(int return_code){
-  #ifdef __APPLE__
-	glfwTerminate();
-  #endif
-  exit(return_code);
-}
+}
\ No newline at end of file
diff --git a/src/main.h b/src/main.h
index 0bab7cb..a0dbff6 100755
--- a/src/main.h
+++ b/src/main.h
@@ -8,50 +8,25 @@
 #ifndef MAIN_H
 #define MAIN_H
 
-#ifdef __APPLE__
-	#include <GL/glfw.h>
-#else
-	#include <GL/glew.h>
-	#include <GL/glut.h>
-#endif
+
+#include <GL/glew.h>
 
 #include <stdlib.h>
-#include <cuda_runtime.h>
-#include <cuda_gl_interop.h>
+
 #include <string>
 #include <iostream>
 #include <sstream>
 #include <fstream>
 #include "glslUtility.h"
-#include "sceneStructs.h"
 #include "glm/glm.hpp"
-#include "image.h"
-#include "raytraceKernel.h"
-#include "utilities.h"
 #include "scene.h"
 
-#if CUDA_VERSION >= 5000
-    #include <helper_cuda.h>
-    #include <helper_cuda_gl.h>
-    #define compat_getMaxGflopsDeviceId() gpuGetMaxGflopsDeviceId() 
-#else
-    #include <cutil_inline.h>
-    #include <cutil_gl_inline.h>
-    #define compat_getMaxGflopsDeviceId() cutGetMaxGflopsDeviceId()
-#endif
-
-using namespace std;
-
-//-------------------------------
-//----------PATHTRACER-----------
-//-------------------------------
 
-scene* renderScene;
-camera* renderCam;
-int targetFrame;
+scene* render_scene;
+int target_frame;
 int iterations;
-bool finishedRender;
-bool singleFrameMode;
+bool is_render_done;
+bool is_single_frame_mode;
 
 //-------------------------------
 //------------GL STUFF-----------
@@ -63,54 +38,4 @@ const char *attributeLocations[] = { "Position", "Tex" };
 GLuint pbo = (GLuint)NULL;
 GLuint displayImage;
 
-//-------------------------------
-//----------CUDA STUFF-----------
-//-------------------------------
-
-int width=800; int height=800;
-
-//-------------------------------
-//-------------MAIN--------------
-//-------------------------------
-
-int main(int argc, char** argv);
-
-//-------------------------------
-//---------RUNTIME STUFF---------
-//-------------------------------
-
-void runCuda();
-
-#ifdef __APPLE__
-	void display();
-#else
-	void display();
-	void keyboard(unsigned char key, int x, int y);
-#endif
-
-//-------------------------------
-//----------SETUP STUFF----------
-//-------------------------------
-
-#ifdef __APPLE__
-	void init();
-#else
-	void init(int argc, char* argv[]);
-#endif
-
-void initPBO(GLuint* pbo);
-void initCuda();
-void initTextures();
-void initVAO();
-GLuint initShader(const char *vertexShaderPath, const char *fragmentShaderPath);
-
-//-------------------------------
-//---------CLEANUP STUFF---------
-//-------------------------------
-
-void cleanupCuda();
-void deletePBO(GLuint* pbo);
-void deleteTexture(GLuint* tex);
-void shut_down(int return_code);
-
 #endif
diff --git a/src/main_initializer.cpp b/src/main_initializer.cpp
new file mode 100644
index 0000000..8ed0028
--- /dev/null
+++ b/src/main_initializer.cpp
@@ -0,0 +1,92 @@
+
+#include "main_initializer.h"
+#include <GL/glew.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_gl_interop.h>
+#if CUDA_VERSION >= 5000
+	#include <helper_cuda.h>
+	#include <helper_cuda_gl.h>
+	#define compat_getMaxGflopsDeviceId() gpuGetMaxGflopsDeviceId() 
+#else
+	#include <cutil_inline.h>
+	#include <cutil_gl_inline.h>
+	#define compat_getMaxGflopsDeviceId() cutGetMaxGflopsDeviceId()
+#endif
+
+// =============================================================================
+// ===  OpenGL x Cuda Initialization  ==========================================
+// =============================================================================
+
+void initCuda(GLuint* pbo, GLuint* texture_id, int width, int height)
+{
+	// Select, for CUDA/GL interop, the device the helper reports as highest-Gflops.
+	const int best_device = compat_getMaxGflopsDeviceId();
+	cudaGLSetGLDevice( best_device );
+
+	// Create the GL-side resources: the pixel buffer first, then the texture.
+	initPBO(pbo, width, height);
+	initTexture(texture_id, width, height);
+	// Exit-time cleanup is currently disabled: //atexit(cleanupCuda);
+}
+
+void initPBO(GLuint* pbo, int width, int height)
+{
+	// Nothing to create when the caller supplies no output handle.
+	if ( pbo == NULL ) return;
+
+	// Byte size of the buffer: width*height texels, four GLubyte values per texel.
+	const int texel_count = width * height;
+	const int buffer_bytes = sizeof(GLubyte) * (texel_count * 4);
+
+	// Generate the GL buffer, bind it as the pixel-unpack buffer, allocate its
+	// storage with no initial data, then register the buffer with CUDA.
+	glGenBuffers(1, pbo);
+	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);
+	glBufferData(GL_PIXEL_UNPACK_BUFFER, buffer_bytes, NULL, GL_DYNAMIC_COPY);
+	cudaGLRegisterBufferObject(*pbo);
+}
+
+
+void initTexture(GLuint* texture_id, int width, int height)
+{
+	// Create one texture name, bind it as the current 2D texture, select nearest filtering, and allocate width x height RGBA8 storage with no initial pixels.
+	glEnable(GL_TEXTURE_2D);
+	glGenTextures(1, texture_id);
+	glBindTexture(GL_TEXTURE_2D, *texture_id);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_BGRA, GL_UNSIGNED_BYTE, NULL);
+}
+
+
+
+// =============================================================================
+// ===  Cleanup  ===============================================================
+// =============================================================================
+
+void cleanupCuda()
+{
+	if ( pbo != 0 ) { deletePBO(&pbo); }	// release the shared pixel buffer; skipped while the handle is 0
+	if ( displayImage != 0 ) { deleteTexture(&displayImage); }	// then the display texture, same rule
+}
+
+void deletePBO(GLuint* pbo)
+{
+	if ( pbo == NULL ) return;	// no handle to operate on
+
+	// Drop the CUDA registration first, then bind and delete the GL buffer.
+	cudaGLUnregisterBufferObject(*pbo);
+	glBindBuffer(GL_ARRAY_BUFFER, *pbo);
+	glDeleteBuffers(1, pbo);
+
+	// Zero the caller's handle so it no longer names the deleted buffer.
+	*pbo = 0;
+}
+
+void deleteTexture(GLuint* tex)
+{
+	if ( !tex ) return;	// tolerate a null handle pointer, as deletePBO does
+	glDeleteTextures(1, tex);	// delete the GL texture named by *tex
+	*tex = (GLuint)NULL;	// zero the caller's handle so it no longer names the deleted texture
+}
diff --git a/src/main_initializer.h b/src/main_initializer.h
new file mode 100644
index 0000000..7feae05
--- /dev/null
+++ b/src/main_initializer.h
@@ -0,0 +1,21 @@
+
+#ifndef MAIN_INITIALIZER_H
+#define MAIN_INITIALIZER_H
+
+#include <GL/glew.h>
+
+extern GLuint pbo;
+extern GLuint displayImage;
+
+// Initialization
+void initCuda(GLuint* pbo, GLuint* texture_id, int width, int height);
+void initPBO(GLuint* pbo, int width, int height);
+void initTexture(GLuint* texture_id, int width, int height);
+
+// Cleanup
+void cleanupCuda();
+void deletePBO(GLuint* pbo);
+void deleteTexture(GLuint* tex);
+
+
+#endif /* MAIN_INITIALIZER_H */
diff --git a/src/main_runtime.cpp b/src/main_runtime.cpp
new file mode 100644
index 0000000..0968b84
--- /dev/null
+++ b/src/main_runtime.cpp
@@ -0,0 +1,176 @@
+
+#include "main_runtime.h"
+#include "sceneStructs.h"
+#include "raytraceKernel.h"
+#include "image.h"
+#include "utilities.h"
+#include <GL/glew.h>
+#include <GL/glut.h>
+#include <cuda_gl_interop.h>
+#include <string>
+#include <iostream>
+
+using namespace std;
+// GLUT display callback: advances the render by one step, then draws the shared pixel buffer to the window.
+void display()
+{
+	camera* cam = &(render_scene->renderCam);
+	int width = cam->resolution.x;
+	int height = cam->resolution.y;
+
+	if ( iterations < cam->iterations )
+	{
+		// Run one more iteration on the GPU; runCuda() increments `iterations` and writes into the PBO.
+		runCuda();
+	}
+	else
+	{
+		if ( !is_render_done )	// first pass after the last iteration of the current frame
+		{
+			SaveFrameToImageFile();
+			if( is_single_frame_mode )
+			{
+				cudaDeviceReset(); 
+				exit(0);	// single-frame mode: quit once the image has been saved
+			}
+			is_render_done = true;
+		}
+		if ( target_frame < cam->frames-1 )
+		{
+			// More frames remain: advance to the next frame, restart the iteration count, and zero the camera's image buffer.
+			target_frame++;
+			iterations = 0;
+			for ( int i=0; i<width*height; ++i )
+			{
+				cam->image[i] = glm::vec3(0,0,0);
+			}
+			cudaDeviceReset(); 	// NOTE(review): this resets the CUDA device state; confirm the PBO registered in initPBO is still usable by the next runCuda()
+			is_render_done = false;
+		}
+	}
+
+	string title = "565Raytracer | " + utilityCore::convertIntToString(iterations) + " Iterations";
+	glutSetWindowTitle(title.c_str());
+
+	// Copy the PBO contents into the display texture: with a pixel-unpack buffer bound, the NULL data argument is an offset into that buffer.
+	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
+	glBindTexture(GL_TEXTURE_2D, displayImage);
+	glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, 
+			GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+
+	glClear(GL_COLOR_BUFFER_BIT);   
+	
+	// Draw the textured quad as two triangles (six 16-bit indices).
+	// The VAO, shader program, and texture are expected to be bound already (main sets them up before entering the GLUT loop).
+	glDrawElements(GL_TRIANGLES, 6,  GL_UNSIGNED_SHORT, 0);
+
+	glutPostRedisplay();	// ask GLUT to call display() again
+	glutSwapBuffers();
+}
+
+
+void runCuda()
+{
+	// Performs one render iteration: the shared pixel buffer is mapped into
+	// CUDA, the kernel wrapper writes into it, and it is unmapped again so
+	// that OpenGL may use it (OpenGL should not touch it while mapped).
+	
+	++iterations;
+	scene* const active_scene = render_scene;
+	const size_t geom_count = active_scene->objects.size();
+	const size_t material_count = active_scene->materials.size();
+
+	// 1) Map the OpenGL buffer; device_pixels receives the mapped address.
+	uchar4* device_pixels = NULL;
+	cudaGLMapBufferObject((void**)&device_pixels, pbo);
+		
+	// Copy the scene's geometry and material lists into plain arrays,
+	// the form in which they are handed to the kernel wrapper.
+	geom* geom_array = new geom[geom_count];
+	material* material_array = new material[material_count];
+	for ( size_t g = 0; g < geom_count; ++g )
+	{
+		geom_array[g] = active_scene->objects[g];
+	}
+	for ( size_t m = 0; m < material_count; ++m )
+	{
+		material_array[m] = active_scene->materials[m];
+	}
+
+	// 2) Render into the mapped buffer, 3) unmap it, then free the temporary arrays.
+	cudaRaytraceCore(device_pixels, &(active_scene->renderCam), target_frame, iterations, material_array, material_count, geom_array, geom_count);
+	cudaGLUnmapBufferObject(pbo);
+
+	delete [] geom_array;
+	delete [] material_array;
+}
+
+void SaveFrameToImageFile()
+{
+	// Copies the camera's image buffer into an image object with the columns
+	// mirrored, attaches gamma settings, and saves it under a per-frame name.
+	camera& cam = render_scene->renderCam;
+	image outputImage(cam.resolution.x, cam.resolution.y);
+	for ( int col = 0; col < cam.resolution.x; col++ )
+	{
+		for ( int row = 0; row < cam.resolution.y; row++ )
+		{
+			const int src = col + (row * cam.resolution.x);
+			outputImage.writePixelRGB(cam.resolution.x-1-col, row, cam.image[src]);
+		}
+	}
+
+	gammaSettings gamma;
+	gamma.applyGamma = true;
+	gamma.gamma = 1.0/2.2;
+	gamma.divisor = 1.0f;	// cam.iterations is not used as the divisor
+	outputImage.setGammaSettings(gamma);
+	// Presumably turns "name.png" into "name.<frame>.png" (likewise ".bmp") — TODO confirm replaceString semantics.
+	stringstream frame_text;
+	frame_text << target_frame;
+	const string frame_label = frame_text.str();
+	string filename = cam.imageName;
+	utilityCore::replaceString(filename, ".bmp", "."+frame_label+".bmp");
+	utilityCore::replaceString(filename, ".png", "."+frame_label+".png");
+	outputImage.saveImageRGB(filename);
+	cout << "Saved frame " << frame_label << " to " << filename << endl;
+}
+
+// Key codes that keyboard() reacts to.
+const unsigned char KEY_ESC = 27;
+const unsigned char KEY_W = 'w';
+
+void keyboard(unsigned char key, int x, int y)
+{
+	// GLUT keyboard callback (registered in main): echoes each key to stdout
+	// and ends the process with exit code 1 on ESC or 'w'. x and y are unused.
+	std::cout << key << std::endl;
+
+	const bool wants_quit = (key == KEY_ESC) || (key == KEY_W);
+	if ( wants_quit )
+	{
+		exit(1);
+	}
+}
+
+void SpecialInput(int key, int x, int y)
+{
+	// GLUT special-key callback (registered in main): arrow keys shift the
+	// vec3 that renderCam.positions points at by 0.1 along x or z.
+	// NOTE(review): always edits *positions, never positions[target_frame],
+	// and leaves `iterations` untouched — confirm both are intended.
+	const float STEP = 0.1f;
+	glm::vec3 offset(0.0f);
+
+	if ( key == GLUT_KEY_UP )
+		offset.z = -STEP;
+	else if ( key == GLUT_KEY_DOWN )
+		offset.z = STEP;
+	else if ( key == GLUT_KEY_LEFT )
+		offset.x = -STEP;
+	else if ( key == GLUT_KEY_RIGHT )
+		offset.x = STEP;
+
+	glm::vec3& camera_position = *(render_scene->renderCam.positions);
+	camera_position = camera_position + offset;
+}
diff --git a/src/main_runtime.h b/src/main_runtime.h
new file mode 100644
index 0000000..f8b56c8
--- /dev/null
+++ b/src/main_runtime.h
@@ -0,0 +1,24 @@
+// Runtime calls (e.g. main loop & interaction)
+
+#ifndef MAIN_RUNTIME_H
+#define MAIN_RUNTIME_H
+
+#include <GL/glew.h>
+#include "sceneStructs.h"
+#include "scene.h"
+
+extern scene* render_scene;
+extern int target_frame;
+extern int iterations;
+extern bool is_render_done;
+extern bool is_single_frame_mode;
+extern GLuint pbo;
+extern GLuint displayImage;
+
+void display();
+void runCuda();
+void SaveFrameToImageFile();
+void keyboard(unsigned char key, int x, int y);
+void SpecialInput(int key, int x, int y);
+
+#endif /* MAIN_RUNTIME_H */
diff --git a/src/raytraceKernel.cu b/src/raytraceKernel.cu
index 87a65a6..5cf5e60 100755
--- a/src/raytraceKernel.cu
+++ b/src/raytraceKernel.cu
@@ -14,7 +14,13 @@
 #include "intersections.h"
 #include "interactions.h"
 #include <vector>
+#include <cuda_runtime_api.h>
 #include "glm/glm.hpp"
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/scan.h>
+#include <thrust/copy.h>
+#include <iostream>
 
 void checkCUDAError(const char *msg) {
   cudaError_t err = cudaGetLastError();
@@ -35,43 +41,6 @@ __host__ __device__ glm::vec3 generateRandomNumberFromThread(glm::vec2 resolutio
   return glm::vec3((float) u01(rng), (float) u01(rng), (float) u01(rng));
 }
 
-//Kernel that does the initial raycast from the camera.
-__host__ __device__ ray raycastFromCameraKernel(glm::vec2 resolution, float time, int x, int y, glm::vec3 eye, glm::vec3 view, glm::vec3 up, glm::vec2 fov){
-   
-  int index = x + (y * resolution.x);
-   
-  thrust::default_random_engine rng(hash(index*time));
-  thrust::uniform_real_distribution<float> u01(0,1);
-  
-  //standard camera raycast stuff
-  glm::vec3 E = eye;
-  glm::vec3 C = view;
-  glm::vec3 U = up;
-  float fovx = fov.x;
-  float fovy = fov.y;
-  
-  float CD = glm::length(C);
-  
-  glm::vec3 A = glm::cross(C, U);
-  glm::vec3 B = glm::cross(A, C);
-  glm::vec3 M = E+C;
-  glm::vec3 H = (A*float(CD*tan(fovx*(PI/180))))/float(glm::length(A));
-  glm::vec3 V = (B*float(CD*tan(-fovy*(PI/180))))/float(glm::length(B));
-  
-  float sx = (x)/(resolution.x-1);
-  float sy = (y)/(resolution.y-1);
-  
-  glm::vec3 P = M + (((2*sx)-1)*H) + (((2*sy)-1)*V);
-  glm::vec3 PmE = P-E;
-  glm::vec3 R = E + (float(200)*(PmE))/float(glm::length(PmE));
-  
-  glm::vec3 direction = glm::normalize(R);
-  //major performance cliff at this point, TODO: find out why!
-  ray r;
-  r.origin = eye;
-  r.direction = direction;
-  return r;
-}
 
 //Kernel that blacks out a given image buffer
 __global__ void clearImage(glm::vec2 resolution, glm::vec3* image){
@@ -117,111 +86,521 @@ __global__ void sendImageToPBO(uchar4* PBOpos, glm::vec2 resolution, glm::vec3*
   }
 }
 
-//TODO: IMPLEMENT THIS FUNCTION
-//Core raytracer kernel
-__global__ void raytraceRay(glm::vec2 resolution, float time, float bounce, cameraData cam, int rayDepth, glm::vec3* colors, 
-                            staticGeom* geoms, int numberOfGeoms, material* materials, int numberOfMaterials){
 
-  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-  int index = x + (y * resolution.x);
+// Reflection formula I - 2 (N . I) N; a true mirror reflection only when N has unit length.
+__device__ glm::vec3 reflect(glm::vec3 const & I, glm::vec3 const & N) {
+  return I - N * (2.0f * glm::dot(N, I));
+}
 
-  ray r = raycastFromCameraKernel(resolution, time, x, y, cam.position, cam.view, cam.up, cam.fov);
-
-  if((x<=resolution.x && y<=resolution.y)){
-
-    float MAX_DEPTH = 100000000000000000;
-    float depth = MAX_DEPTH;
-
-    for(int i=0; i<numberOfGeoms; i++){
-        glm::vec3 intersectionPoint;
-        glm::vec3 intersectionNormal;
-       if(geoms[i].type==SPHERE){
-           depth = sphereIntersectionTest(geoms[i], r, intersectionPoint, intersectionNormal);
-        }else if(geoms[i].type==CUBE){
-            depth = boxIntersectionTest(geoms[i], r, intersectionPoint, intersectionNormal);
-        }else if(geoms[i].type==MESH){
-            //triangle tests go here
-        }else{
-            //lol?
-        }
-        if(depth<MAX_DEPTH && depth>-EPSILON){
-          MAX_DEPTH = depth;
-          colors[index] = materials[geoms[i].materialid].color;
-        }
-    }
+// Visibility test: returns true when no geometry lies between point1 and point2.
+__device__ bool isRayUnblocked(glm::vec3 const & point1, glm::vec3 const & point2, staticGeom* geoms, int numberOfGeoms)
+{
+  glm::vec3 DIRECTION(point2 - point1);
+  float DISTANCE = glm::length(DIRECTION);
+
+  // Trim a fixed world-space margin off both ends of the segment so that geometry
+  // touching either endpoint is not reported as an occluder.
+  float DELTA = 0.001f;
+  if (DISTANCE <= 2.0f * DELTA) return true;  // Degenerate segment: no room for an occluder.
+  ray r;
+  r.direction = DIRECTION / DISTANCE;  // Unit vector from point1 towards point2.
+  r.origin = point1 + DELTA * r.direction;
+  float MAX_HIT_DISTANCE = DISTANCE - 2.0f * DELTA;  // Measured from the offset origin.
+
+  for (int i=0; i<numberOfGeoms; ++i)
+  {
+    glm::vec3 intersectionPoint;
+    glm::vec3 normal;
+    float intersectionDistance = geomIntersectionTest(geoms[i], r, intersectionPoint, normal);
 
+    // Non-positive means a miss; a hit only blocks if it lies before the trimmed far end.
+    if (intersectionDistance > 0.0f && intersectionDistance < MAX_HIT_DISTANCE) return false;
+  }
+  return true;
+}
+
+/*
+__global__ void raytraceRay(glm::vec2 resolution, int time, float bounce, cameraData cam, int rayDepth, glm::vec3* colors, 
+							staticGeom* geoms, int numberOfGeoms, material* materials, int numberOfMaterials, ray* d_rays)
+{
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int index = x + (y * resolution.x);
+
+	if ( x >= resolution.x || y >= resolution.y ) return;
+	ray r = d_rays[index];
+
+	// ============================================
+	// Determine closest intersection with geometry
+	// ============================================
+	
+	float distance = -1.0f;
+	glm::vec3 intersection;
+	glm::vec3 normal;
+	int materialIdx;
+	for (int i = 0; i < numberOfGeoms; ++i)
+	{
+		float newDistance;
+		glm::vec3 newIntersection;
+		glm::vec3 newNormal;
+		switch (geoms[i].type)
+		{
+			case SPHERE:
+				newDistance = sphereIntersectionTest(geoms[i], r, newIntersection, newNormal);
+				break;
+			case CUBE:
+				newDistance = boxIntersectionTest(geoms[i], r, newIntersection, newNormal);
+				break;
+			case MESH:
+				newDistance = -1.0f;
+				break;
+		}
+		if ( newDistance < 0.0f ) continue;
+		if ( distance < 0.0f || (distance > 0.0f && newDistance < distance) )
+		{
+			distance = newDistance;
+			intersection = newIntersection;
+			normal = newNormal;
+			materialIdx = geoms[i].materialid;
+		}
+	}
+	
+	// ============================================
+	// Paint pixel
+	// ============================================
+
+	// No hit
+	if ( distance < 0.0f )
+	{
+		colors[index] = glm::vec3(0.0f, 0.0f, 0.0f);
+		//colors[index] = generateRandomNumberFromThread(resolution, time, x, y);
+		return;
+	}
+
+	// Simple local reflectance model (local illumination model formula)
+	float reflectivity = 0.0f;
+	float transmittance = 1.0f - reflectivity;
+	glm::vec3 materialColor = materials[materialIdx].color;
+	glm::vec3 reflectedColor(0.0f, 0.0f, 0.0f);
+	glm::vec3 ambientLightColor(1.0f, 1.0f, 1.0f);
+	
+	float AMBIENT_WEIGHT = 0.2f;	// Ka - Ambient reflectivity factor
+	float DIFFUSE_WEIGHT = 0.3f;	// Kd - Diffuse reflectivity factor
+	float SPECULAR_WEIGHT = 0.5f;	// Ks - Specular reflectivity factor
+
+	glm::vec3 lightColor(1.0f, 1.0f, 1.0f);
+	glm::vec3 color = AMBIENT_WEIGHT * ambientLightColor * materialColor;
+
+	thrust::default_random_engine rng(hash(index*time));
+	thrust::uniform_real_distribution<float> u01(-0.15f, 0.15f);
+	for ( int i = 0; i < 1; ++i)
+	{
+		glm::vec3 lightPosition(0.25f + (float) u01(rng), 1.0f, (float) u01(rng));
+		// Unit vector from intersection point to light source
+		glm::vec3 LIGHT_DIRECTION = glm::normalize(lightPosition - intersection);
+		// Direction of reflected light at intersection point
+		glm::vec3 LIGHT_REFLECTION = glm::normalize(reflect(-1.0f*LIGHT_DIRECTION, normal));
+
+		// Determine diffuse term
+		float diffuseTerm;
+		diffuseTerm = glm::dot(normal, LIGHT_DIRECTION);
+		diffuseTerm = glm::clamp(diffuseTerm, 0.0f, 1.0f);
+
+		// Determine specular term
+		float specularTerm = 0.0f;
+		if ( materials[materialIdx].specularExponent - 0.0f > 0.001f )
+		{
+			float SPECULAR_EXPONENT = materials[materialIdx].specularExponent;
+			glm::vec3 EYE_DIRECTION = glm::normalize(cam.position - intersection);
+			specularTerm = glm::dot(LIGHT_REFLECTION, EYE_DIRECTION);
+			specularTerm = pow(fmaxf(specularTerm, 0.0f), SPECULAR_EXPONENT);
+			specularTerm = glm::clamp(specularTerm, 0.0f, 1.0f);
+		}
 
+		if (isRayUnblocked(intersection, lightPosition, geoms, numberOfGeoms))
+		{
+			color += DIFFUSE_WEIGHT * lightColor * materialColor * diffuseTerm / 1.0f;
+			color += SPECULAR_WEIGHT * lightColor * specularTerm / 1.0f;
+		}
+	}
 
-    //colors[index] = generateRandomNumberFromThread(resolution, time, x, y);
-   }
+	glm::vec3 new_color = reflectivity*reflectedColor + transmittance*color;
+
+	if ( time > 1 )
+	{
+		colors[index] += (new_color - colors[index]) / (float)time;
+		return;
+	}
+	colors[index] = new_color;
 }
+*/
 
+// Returns the unit direction of the camera ray through pixel (x, y).
+// Requires x in [0, width-1] and y in [0, height-1].
+// The sub-pixel jitter is derived from random_seed only (not x or y), so every pixel
+// evaluated with the same seed shares the same offset.
+__host__ __device__ glm::vec3 GetRayDirectionFromCamera(const cameraData& cam, int x, int y, int random_seed)
+{
+	float random1, random2;         // Random # between 0 and 1 (from random_seed).
 
-//TODO: FINISH THIS FUNCTION
-// Wrapper for the __global__ call that sets up the kernel calls and does a ton of memory management
-void cudaRaytraceCore(uchar4* PBOpos, camera* renderCam, int frame, int iterations, material* materials, int numberOfMaterials, geom* geoms, int numberOfGeoms){
-  
-  int traceDepth = 1; //determines how many bounces the raytracer traces
+	// Draw the two jitter values from a generator seeded with hash(random_seed).
+	{
+		thrust::default_random_engine rng(hash(random_seed));
+		thrust::uniform_real_distribution<float> u01(0,1);
+		random1 = u01(rng);
+		random2 = u01(rng);
+	}
 
-  // set up crucial magic
-  int tileSize = 8;
-  dim3 threadsPerBlock(tileSize, tileSize);
-  dim3 fullBlocksPerGrid((int)ceil(float(renderCam->resolution.x)/float(tileSize)), (int)ceil(float(renderCam->resolution.y)/float(tileSize)));
-  
-  //send image to GPU
-  glm::vec3* cudaimage = NULL;
-  cudaMalloc((void**)&cudaimage, (int)renderCam->resolution.x*(int)renderCam->resolution.y*sizeof(glm::vec3));
-  cudaMemcpy( cudaimage, renderCam->image, (int)renderCam->resolution.x*(int)renderCam->resolution.y*sizeof(glm::vec3), cudaMemcpyHostToDevice);
+	float width = (float) cam.resolution.x;
+	float height = (float) cam.resolution.y;
+	
+	glm::vec3 c(cam.view);          // View direction (unit vector) from eye.
+	glm::vec3 e(cam.position);      // Camera center position.
+	glm::vec3 m = e + c;            // Midpoint of screen.
+	glm::vec3 u(cam.up);            // Up vector.
+	glm::vec3 a = glm::cross(c, u); // c x u TODO: make sure this is well defined
+	glm::vec3 b = glm::cross(a, c); // a x c TODO: make sure this is well defined
   
-  //package geometry and materials and sent to GPU
-  staticGeom* geomList = new staticGeom[numberOfGeoms];
-  for(int i=0; i<numberOfGeoms; i++){
-    staticGeom newStaticGeom;
-    newStaticGeom.type = geoms[i].type;
-    newStaticGeom.materialid = geoms[i].materialid;
-    newStaticGeom.translation = geoms[i].translations[frame];
-    newStaticGeom.rotation = geoms[i].rotations[frame];
-    newStaticGeom.scale = geoms[i].scales[frame];
-    newStaticGeom.transform = geoms[i].transforms[frame];
-    newStaticGeom.inverseTransform = geoms[i].inverseTransforms[frame];
-    geomList[i] = newStaticGeom;
-  }
-  
-  staticGeom* cudageoms = NULL;
-  cudaMalloc((void**)&cudageoms, numberOfGeoms*sizeof(staticGeom));
-  cudaMemcpy( cudageoms, geomList, numberOfGeoms*sizeof(staticGeom), cudaMemcpyHostToDevice);
+	glm::vec3 v;	                // Vertical vector from "m" to top of screen.
+	glm::vec3 h;	                // Horizontal vector from "m" to right of screen.
+
+	// Calculate v & h
+	{
+		float phi = cam.fov.y * PI / 180.0f / 2.0f;	// Half of fov.y, converted from degrees to radians.
+		float screen_ratio = height / width;
+		v = b * tan(phi) / (float)glm::length(b);	// |v| = tan(phi): the length of c is not factored in here.
+		float theta = atan(glm::length(v)/screen_ratio / (float)glm::length(c));
+		h = a * (float)glm::length(c) * tan(theta) / (float)glm::length(a);	// tan(atan(t)) == t, so |h| = |v| / screen_ratio.
+	}
   
-  material* cudamaterials = NULL;
-  cudaMalloc((void**)&cudamaterials, numberOfMaterials*sizeof(material));
-  cudaMemcpy( cudamaterials, materials, numberOfMaterials*sizeof(material), cudaMemcpyHostToDevice);
-
-  //package camera
-  cameraData cam;
-  cam.resolution = renderCam->resolution;
-  cam.position = renderCam->positions[frame];
-  cam.view = renderCam->views[frame];
-  cam.up = renderCam->ups[frame];
-  cam.fov = renderCam->fov;
-
-  //kernel launches
-  for(int bounce = 1; bounce <= 1; ++bounce)
-  {
-  raytraceRay<<<fullBlocksPerGrid, threadsPerBlock>>>(renderCam->resolution, (float)iterations, (float)bounce, cam, traceDepth, cudaimage, cudageoms, numberOfGeoms, cudamaterials, numberOfMaterials);
-  }
-  sendImageToPBO<<<fullBlocksPerGrid, threadsPerBlock>>>(PBOpos, renderCam->resolution, cudaimage);
+	// Obtain a unit vector from the eye to pixel (x, y) on screen; h and v are subtracted, so increasing x / y moves against h / v.
+	float sx = (x + random1) / width;				// Without jitter: x / (width - 1.0f)
+	float sy = (y + random2) / height;				//				   y / (height - 1.0f)
+	glm::vec3 p = m - (2*sx - 1)*h - (2*sy - 1)*v;	// World position of point (x, y) on screen 
+	return glm::normalize(p-e);
+}
+
+// Sets up one primary ray per pixel from the camera and resets that ray's bookkeeping.
+// Uses 2D block/thread indices as pixel coordinates; threads outside the image exit early.
+__global__ void InitRay(cameraData cam, int random_seed, ray* d_rays, glm::vec3* d_lights, bool* d_is_ray_alive, int* d_ray_idx)
+{
+	const int px = (blockIdx.x * blockDim.x) + threadIdx.x;
+	const int py = (blockIdx.y * blockDim.y) + threadIdx.y;
+	const int image_width  = cam.resolution.x;
+	const int image_height = cam.resolution.y;
+
+	if ( px >= image_width || py >= image_height ) return;
+
+	const int slot = px + (py * image_width);	// Row-major pixel index, also the ray slot.
+	d_ray_idx[slot]			= slot;				// Remember which pixel this ray shades.
+	d_is_ray_alive[slot]	= true;
+	d_lights[slot]			= glm::vec3(1.0f);	// Accumulated color multiplier starts at 1.
+	d_rays[slot].direction	= GetRayDirectionFromCamera(cam, px, py, random_seed);
+	d_rays[slot].origin		= cam.position;
+}
+
+
+
+// Finds the nearest geometry hit by ray r. Outputs, written only when something is hit:
+//		p: Intersection point.
+//		n: Normal at the intersection, as reported by geomIntersectionTest.
+//		material_id: Material index of the intersected object.
+// Returns true if any geometry was hit. "materials" is currently unused.
+__device__ bool GetClosestIntersection(ray& r, staticGeom* geoms, int num_geoms, material* materials,
+									   glm::vec3& p, glm::vec3& n, int& material_id)
+{
+	float nearest = -1.0f;	// Stays negative until the first hit is recorded.
+
+	for ( int g = 0; g < num_geoms; ++g )
+	{
+		// Ignore emitters.
+		//if ( IsEmitter(geoms[g].materialid, materials) ) continue;
+
+		glm::vec3 hit_point;
+		glm::vec3 hit_normal;
+		const float hit_distance = geomIntersectionTest(geoms[g], r, hit_point, hit_normal);
+
+		// Negative distance means a miss; otherwise keep the hit if it is the first or the closest.
+		const bool is_hit = !( hit_distance < 0.0f );
+		const bool is_closer = ( nearest < 0.0f ) || ( hit_distance < nearest );
+		if ( !is_hit || !is_closer ) continue;
+
+		nearest = hit_distance;
+		p = hit_point;
+		n = hit_normal;
+		material_id = geoms[g].materialid;
+	}
+
+	return !( nearest < 0.0f );
+}
+
+// True when material "id" counts as a light source, i.e. its emittance is above 0.5.
+__host__ __device__ bool IsEmitter(int id, material* materials) {
+	return 0.5f < materials[id].emittance;
+}
+
+// Blends new_color into colors[idx]: when iterations > 1 the stored value moves towards
+// new_color by 1/iterations of their difference; otherwise it is simply overwritten.
+__device__ void SetAverageColor(glm::vec3* colors, int idx, glm::vec3& new_color, int iterations)
+{
+	glm::vec3& pixel = colors[idx];
+	const bool overwrite = !( iterations > 1 );
+	if ( overwrite ) { pixel = new_color; return; }
+	pixel += (new_color - pixel) / (float)iterations;
+}
+
+
+// Advances every live ray by one bounce. Launch 1D with at least num_rays threads.
+//   Miss:    pixel d_ray_idx[idx] receives light * background, the ray is marked dead.
+//   Emitter: pixel receives light * color * emittance, the ray is marked dead.
+//   Else:    d_lights[idx] is multiplied by the surface color and d_rays[idx] becomes the
+//            mirrored ray (reflective material) or a random hemisphere ray.
+// iterations: sample number, used for the running average and the random seed.
+// depth, max_depth, num_pixels: only used to derive the random seed.
+// num_materials: unused.
+__global__ void TraceRay(int iterations, int depth, int max_depth, int num_pixels, ray* d_rays, int num_rays, glm::vec3* d_lights, bool* d_is_ray_alive, int* d_ray_idx,
+		glm::vec3* colors, staticGeom* geoms, int num_geoms, material* materials, int num_materials)
+{
+	int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if ( idx >= num_rays ) return;
+	if ( !d_is_ray_alive[idx] ) return;
+	
+	// Work on local copies of this ray's state.
+	ray ray_in = d_rays[idx];
+	glm::vec3 light = d_lights[idx];
+	const int pixel = d_ray_idx[idx];	// Image pixel this ray contributes to.
+
+	bool is_intersected;
+	glm::vec3 p;				// Intersection point.
+	glm::vec3 n;				// Normal unit vector at intersection.
+	int material_id;			// Of intersected object.
+	
+	is_intersected = GetClosestIntersection(ray_in, geoms, num_geoms, materials, p, n, material_id);
+	
+	// No hit, return (light * bg).
+	if ( !is_intersected )
+	{
+		glm::vec3 bg_color(0.2f);
+		glm::vec3 new_color = light * bg_color;
+
+		d_is_ray_alive[idx] = false;
+		SetAverageColor(colors, pixel, new_color, iterations);
+		return;
+	}
+	
+	// Hit emitter, return (light * emitter).
+	if ( IsEmitter(material_id, materials) )
+	{
+		glm::vec3 new_color = light * materials[material_id].color * materials[material_id].emittance;
+
+		d_is_ray_alive[idx] = false;
+		SetAverageColor(colors, pixel, new_color, iterations);
+		return;
+	}
+
+	// Make ray_out in random direction.
+	ray ray_out;
+	float xi1, xi2;
+	{
+		// One distinct seed per (iteration, bounce, pixel). Keyed on the pixel rather than the
+		// compacted slot; unsigned arithmetic so that large iteration counts wrap instead of overflowing.
+		unsigned int seed = ((unsigned int)(iterations - 1) * max_depth + depth) * num_pixels + pixel;
+		thrust::default_random_engine rng(hash(seed));
+		thrust::uniform_real_distribution<float> u01(0,1);
+		xi1 = u01(rng);
+		xi2 = u01(rng);
+	}
+	if ( materials[material_id].hasReflective )
+	{
+		ray_out.direction = reflect(ray_in.direction, glm::normalize(n));
+	}
+	else
+	{
+		ray_out.direction = calculateRandomDirectionInHemisphere(glm::normalize(n), xi1, xi2);
+	}
+	ray_out.origin = p + 0.001f * ray_out.direction;	// Nudge off the surface to avoid self-intersection.
+
+	// Update light & ray.
+	d_lights[idx] = light * materials[material_id].color;
+	d_rays[idx] = ray_out;
+	
+
+	// TODO: Kill rays with negligible throughput.
+	
+	// TODO: Direct illumination (unfinished sketch below).
+
+	// For each light...
+	/*
+	int num_lights = 0;
+	for ( int i=0; i < num_geoms; ++i )
+	{
+		// Ignore non-emitters.
+		if ( materials[geoms[i].materialid].emittance < 0.5f ) continue;
+		
+		++ num_lights;
+
+		// 1) Sample a point on light
+		glm::vec3 point_on_light;
+		point_on_light = getRandomPointOnGeom(geoms[i], iterations+depth);
+
+		// 2) L += [throughput] * [avg of visible lights]
+		glm::vec3 direct_L(0.0f);
+		if ( isRayUnblocked(p, point_on_light, geoms, num_geoms) )
+		{
+			direct_L += throughput * materials[geoms[i].materialid].color
+		}
+		L += direct_L / (float) num_lights;
+	}
+	
+
+	throughput = throughput * materials[material_id].color;
+
+	//glm::vec3 new_color = ;
+	SetAverageColor(colors, idx, new_color, iterations);
+	*/
+}
+
+
+// Copies the state of every live ray into slot td_v[idx] of the *_copy arrays (stream compaction).
+// td_v presumably holds the exclusive scan of d_is_ray_alive -- confirm at the call site.
+__global__ void CompactRays(int* td_v, ray* d_rays, glm::vec3* d_lights, bool* d_is_ray_alive, int* d_ray_idx, int num_rays,
+	                                   ray* d_rays_copy, glm::vec3* d_lights_copy, bool* d_is_ray_alive_copy, int* d_ray_idx_copy)
+{
+	const int src = (blockIdx.x * blockDim.x) + threadIdx.x;
+	const bool in_range = ( src < num_rays );
+	if ( !in_range || !d_is_ray_alive[src] ) return;	// Short-circuit keeps the flag read in bounds.
+	const int dst = td_v[src];
+	d_ray_idx_copy[dst] = d_ray_idx[src];
+	d_is_ray_alive_copy[dst] = true;
+	d_lights_copy[dst] = d_lights[src];
+	d_rays_copy[dst] = d_rays[src];
+}
+
+
+
+// Host wrapper: renders one path-traced sample for animation frame "frame" and blends it into cam->image.
+// Uploads the image, geometry and materials, bounces one ray per pixel up to max_depth times with
+// stream compaction between bounces, then writes the result to PBOpos and copies it back to cam->image.
+void cudaRaytraceCore(uchar4* PBOpos, camera* cam, int frame, int iterations, material* materials, int num_materials, geom* geoms, int num_geoms)
+{
+	int width = cam->resolution.x;
+	int height = cam->resolution.y;
+	int num_pixels = width * height;
+
+	// Launch configuration for the per-pixel (2D) kernels.
+	int tile_size = 8;
+	dim3 threadsPerBlock(tile_size, tile_size);
+	dim3 fullBlocksPerGrid(ceil((float)width/tile_size), ceil((float)height/tile_size));
+
+	// Copy image to GPU.
+	glm::vec3* d_image = NULL;
+	cudaMalloc((void**)&d_image, num_pixels*sizeof(glm::vec3));
+	cudaMemcpy(d_image, cam->image, num_pixels*sizeof(glm::vec3), cudaMemcpyHostToDevice);
+	// Package geometry.
+	staticGeom* geomList = new staticGeom[num_geoms];
+	for ( int i=0; i<num_geoms; ++i )
+	{
+		staticGeom newStaticGeom;
+		newStaticGeom.type = geoms[i].type;
+		newStaticGeom.materialid = geoms[i].materialid;
+		newStaticGeom.translation = geoms[i].translations[frame];
+		newStaticGeom.rotation = geoms[i].rotations[frame];
+		newStaticGeom.scale = geoms[i].scales[frame];
+		newStaticGeom.transform = geoms[i].transforms[frame];
+		newStaticGeom.inverseTransform = geoms[i].inverseTransforms[frame];
+		geomList[i] = newStaticGeom;
+	}
+	// Copy geometry to GPU.
+	staticGeom* d_geoms = NULL;
+	cudaMalloc((void**)&d_geoms, num_geoms*sizeof(staticGeom));
+	cudaMemcpy( d_geoms, geomList, num_geoms*sizeof(staticGeom), cudaMemcpyHostToDevice);
+	
+	// Copy materials to GPU.
+	material* cudamaterials = NULL;
+	cudaMalloc((void**)&cudamaterials, num_materials*sizeof(material));
+	cudaMemcpy( cudamaterials, materials, num_materials*sizeof(material), cudaMemcpyHostToDevice);
+
+	// Package camera.
+	cameraData cam_data;
+	cam_data.resolution = cam->resolution;
+	cam_data.position = cam->positions[frame];
+	cam_data.view = cam->views[frame];
+	cam_data.up = cam->ups[frame];
+	cam_data.fov = cam->fov;
+
+	// Allocate GPU memory for rays & initialize them.
+	ray* d_rays			 = NULL;
+	glm::vec3* d_lights	 = NULL;
+	bool* d_is_ray_alive = NULL;
+	int* d_ray_idx		 = NULL;
+	cudaMalloc((void**)&d_rays,			num_pixels*sizeof(ray));
+	cudaMalloc((void**)&d_lights,		num_pixels*sizeof(glm::vec3));
+	cudaMalloc((void**)&d_is_ray_alive,	num_pixels*sizeof(bool));
+	cudaMalloc((void**)&d_ray_idx,		num_pixels*sizeof(int));
+	InitRay<<<fullBlocksPerGrid, threadsPerBlock>>>(cam_data, iterations, d_rays, d_lights, d_is_ray_alive, d_ray_idx);
+
+	// Bounce loop. Stops early once every ray has terminated: a zero-block launch and td_v[num_rays-1] are invalid for num_rays == 0.
+	int num_rays = num_pixels;
+	int max_depth = 10; // # of bounces when raytracing.
+	for ( int depth = 0; depth < max_depth && num_rays > 0; ++depth )
+	{
+		// Determine # of blocks to launch based on # of rays.
+		int num_threads_per_block = 128;
+		int num_blocks_per_grid = ceil((float)num_rays / num_threads_per_block);
+
+		// Update d_rays & d_lights based on intersected object.
+		TraceRay<<<num_blocks_per_grid, num_threads_per_block>>>(iterations, depth, max_depth, num_pixels, d_rays, num_rays, d_lights, d_is_ray_alive, d_ray_idx, d_image, d_geoms, num_geoms, cudamaterials, num_materials);
+		
+		// Update d_rays by removing dead rays (stream compaction).
+		thrust::device_ptr<bool> td_is_ray_alive = thrust::device_pointer_cast(d_is_ray_alive);
+		thrust::device_vector<int> td_v(num_rays);
+		thrust::exclusive_scan(td_is_ray_alive, td_is_ray_alive + num_rays, td_v.begin());
+		
+		// Survivor count = last scan value + last flag. Allocate device memory for storing copy.
+		int num_copy_rays = td_v[num_rays-1] + (int) td_is_ray_alive[num_rays-1];
+		ray* d_rays_copy		  = NULL;
+		glm::vec3* d_lights_copy  = NULL;
+		bool* d_is_ray_alive_copy = NULL;
+		int* d_ray_idx_copy		  = NULL;
+		cudaMalloc((void**)&d_rays_copy,		 num_copy_rays*sizeof(ray));
+		cudaMalloc((void**)&d_lights_copy,		 num_copy_rays*sizeof(glm::vec3));
+		cudaMalloc((void**)&d_is_ray_alive_copy, num_copy_rays*sizeof(bool));
+		cudaMalloc((void**)&d_ray_idx_copy,		 num_copy_rays*sizeof(int));
+		
+		// Only copy living rays.
+		CompactRays<<<num_blocks_per_grid, num_threads_per_block>>>(thrust::raw_pointer_cast(td_v.data()), d_rays, d_lights, d_is_ray_alive, d_ray_idx, num_rays, d_rays_copy, d_lights_copy, d_is_ray_alive_copy, d_ray_idx_copy);
+		cudaDeviceSynchronize();
+		
+		// Free old memory & update pointers to the copies.
+		cudaFree(d_rays);
+		cudaFree(d_lights);
+		cudaFree(d_is_ray_alive);
+		cudaFree(d_ray_idx);
+		num_rays		= num_copy_rays;
+		d_rays			= d_rays_copy;
+		d_lights		= d_lights_copy;
+		d_is_ray_alive	= d_is_ray_alive_copy;
+		d_ray_idx		= d_ray_idx_copy;
+	}
+
+	sendImageToPBO<<<fullBlocksPerGrid, threadsPerBlock>>>(PBOpos, cam->resolution, d_image);
 
-  //retrieve image from GPU
-  cudaMemcpy( renderCam->image, cudaimage, (int)renderCam->resolution.x*(int)renderCam->resolution.y*sizeof(glm::vec3), cudaMemcpyDeviceToHost);
+	// Retrieve image from GPU.
+	cudaMemcpy( cam->image, d_image, num_pixels*sizeof(glm::vec3), cudaMemcpyDeviceToHost);
 
-  //free up stuff, or else we'll leak memory like a madman
-  cudaFree( cudaimage );
-  cudaFree( cudageoms );
-  cudaFree( cudamaterials );
-  delete [] geomList;
+	// Free memory.
+	cudaFree( d_image );
+	cudaFree( d_geoms );
+	cudaFree( cudamaterials );
+	cudaFree( d_rays );
+	cudaFree( d_lights );
+	cudaFree( d_is_ray_alive );
+	cudaFree( d_ray_idx );
+	delete [] geomList;
 
-  // make certain the kernel has completed 
-  cudaThreadSynchronize();
+	// Make sure the kernel has completed.
+	cudaDeviceSynchronize();
 
-  checkCUDAError("Kernel failed!");
+	checkCUDAError("Kernel failed!");
 }
diff --git a/trippy.png b/trippy.png
new file mode 100644
index 0000000..a855f3a
Binary files /dev/null and b/trippy.png differ