diff --git a/jprof.lua b/jprof.lua
index 705c2cf..a874d6a 100644
--- a/jprof.lua
+++ b/jprof.lua
@@ -23,6 +23,7 @@ local profiler = {}
 -- since no allocations/deallocations are triggered by them anymore
 local zoneStack = {nil, nil, nil, nil, nil, nil, nil, nil,
                    nil, nil, nil, nil, nil, nil, nil, nil}
+local profDataNumEvents = 0
 local profData = {}
 local netBuffer = nil
 local profEnabled = true
@@ -31,6 +32,12 @@ local profEnabled = true
 -- to measure the jprof-less (i.e. "real") memory consumption
 local profMem = 0
 
+-- threaded write stuff
+local isThreaded = false
+local numThreads = nil
+local chunkSize = nil
+local eventChannels = {}
+
 local function getByte(n, byte)
     return bit.rshift(bit.band(n, bit.lshift(0xff, 8*byte)), 8*byte)
 end
@@ -39,8 +46,7 @@ end
 -- the file in chunks. If we attempt to pack a big table, the amount of memory
 -- used during packing can exceed the luajit memory limit pretty quickly, which will
 -- terminate the program before the file is written.
-local function msgpackListIntoFile(list, file)
-    local n = #list
+local function msgpackListIntoFile(n, file)
     -- https://github.com/msgpack/msgpack/blob/master/spec.md#array-format-family
     if n < 16 then
         file:write(string.char(144 + n))
@@ -51,15 +57,46 @@ local function msgpackListIntoFile(list, file)
     else
         error("List too big")
     end
-    for _, elem in ipairs(list) do
-        file:write(msgpack.pack(elem))
-    end
+
+	if isThreaded then
+		local DONE = true
+		for _, channel in ipairs(eventChannels) do
+			channel:supply(DONE)
+		end
+
+		local channelIdx = 1
+		-- iterate for each chunk, rounded up to account for the last
+		-- potentially incomplete chunk
+		local numChunks = math.ceil(n/chunkSize)
+		for _ = 1, numChunks do
+			local chunkStr = eventChannels[channelIdx]:demand()
+			file:write(chunkStr)
+			channelIdx = channelIdx % numThreads + 1
+		end
+	else
+		for _, event in ipairs(profData) do
+			file:write(msgpack.pack(event))
+		end
+	end
 end
 
+local profDataNumEventsInChunk = 0
+local currentChannelIndex = 1
 local function addEvent(name, memCount, annot)
-    local event = {name, love.timer.getTime(), memCount, annot}
+	local time = love.timer.getTime()
+	local event = {name, time, memCount, annot}
     if profData then
-        table.insert(profData, event)
+		profDataNumEvents = profDataNumEvents + 1
+		if isThreaded then
+			eventChannels[currentChannelIndex]:push(event)
+			profDataNumEventsInChunk = profDataNumEventsInChunk + 1
+			if profDataNumEventsInChunk == chunkSize then
+				currentChannelIndex = currentChannelIndex % numThreads + 1
+				profDataNumEventsInChunk = 0
+			end
+		else
+			profData[profDataNumEvents] = event
+		end
     end
     if netBuffer then
         table.insert(netBuffer, event)
@@ -84,7 +121,11 @@ if PROF_CAPTURE then
         -- if the full profiling data is not saved to profData, then only netBuffer will increase the
         -- memory used by jprof and all of it will be freed for garbage collection at some point, so that
         -- we should probably not try to keep track of it at all
-        if profData then
+		--
+		-- Ditto for threaded write support: All event storage is squirrelled
+		-- away into worker threads, so we don't actually increase the memory
+		-- toll on the "main" thread VM.
+        if profData and not isThreaded then
             profMem = profMem + (collectgarbage("count") - memCount)
         end
     end
@@ -103,7 +144,7 @@ if PROF_CAPTURE then
         if profiler.socket and #zoneStack == 0 then
             profiler.netFlush()
         end
-        if profData then
+        if profData and not isThreaded then
             profMem = profMem + (collectgarbage("count") - memCount)
         end
     end
@@ -114,17 +155,36 @@ if PROF_CAPTURE then
         end
     end
 
+    -- Hands event storage to worker threads that pack it for prof.write(); call before any event exists.
+    -- Defaults are untuned guesses: twice the processor count, 512 events per chunk (avoid tiny chunks).
+    function profiler.enableThreadedWrite(_numThreads, _chunkSize)
+        assert(profData, "(jprof) profiling disabled (did you call prof.connect()?)")
+        assert(profDataNumEvents == 0, "(jprof) prof.enableThreadedWrite() should be called before creating profile events")
+        isThreaded = true
+        numThreads = _numThreads or love.system.getProcessorCount() * 2
+        chunkSize = _chunkSize or 512
+        for _ = 1, numThreads do
+            local channel = love.thread.newChannel()
+            table.insert(eventChannels, channel)
+            love.thread.newThread("serializeWorkerThread.lua"):start(channel, chunkSize)
+        end
+    end
+
     function profiler.write(filename)
         assert(#zoneStack == 0, "(jprof) Zone stack is not empty")
 
         if not profData then
             print("(jprof) No profiling data saved (probably because you called prof.connect())")
         else
+            print(("(jprof) Saving %d profiled events..."):format(profDataNumEvents))
+            local serializeTime = love.timer.getTime()
             local file, msg = love.filesystem.newFile(filename, "w")
             assert(file, msg)
-            msgpackListIntoFile(profData, file)
+            -- 'full' buffering only helps with an explicit size; setBuffer's default size of 0 buffers nothing
+            file:setBuffer('full', 64 * 1024)
+            msgpackListIntoFile(profDataNumEvents, file)
             file:close()
-            print(("(jprof) Saved profiling data to '%s'"):format(filename))
+            print(("(jprof) Saved profiling data to '%s' (%f seconds)"):format(filename, love.timer.getTime() - serializeTime))
         end
     end
 
@@ -191,6 +251,7 @@ else
     profiler.push = noop
     profiler.pop = noop
     profiler.write = noop
+    profiler.enableThreadedWrite = noop
     profiler.enabled = noop
     profiler.connect = noop
     profiler.netFlush = noop
diff --git a/serializeWorkerThread.lua b/serializeWorkerThread.lua
new file mode 100644
index 0000000..cfb64ca
--- /dev/null
+++ b/serializeWorkerThread.lua
@@ -0,0 +1,42 @@
+-- jprof serialization worker, started with (channel, chunkSize); all traffic uses that one channel.
+-- It buffers every value received until `true` arrives, then pushes back strings, each holding up to
+-- chunkSize msgpack-packed events concatenated in the order they were received.
+local channel, eventsPerChunk = ...
+
+love.filesystem = require 'love.filesystem'
+local msgpack = require("MessagePack")
+msgpack.set_number("double")
+
+-- Phase 1: buffer incoming events until the `true` sentinel is received.
+local events, numEvents = {}, 0
+while true do
+	local msg = channel:demand()
+	if msg == true then
+		break
+	end
+	numEvents = numEvents + 1
+	events[numEvents] = msg
+end
+
+-- Phase 2: pack every event and hand the packed strings back one chunk at a time.
+local packed, numPacked = {}, 0
+
+-- Concatenates the packed events gathered so far, pushes the result and starts a new chunk.
+local function flushChunk()
+	channel:push(table.concat(packed))
+	packed, numPacked = {}, 0
+end
+
+for i = 1, numEvents do
+	numPacked = numPacked + 1
+	packed[numPacked] = msgpack.pack(events[i])
+	if numPacked == eventsPerChunk then
+		flushChunk()
+	end
+end
+
+-- Whatever is left over forms a final, shorter chunk; only one worker is expected
+-- to end up with such a remainder.
+if numPacked ~= 0 then
+	flushChunk()
+end