
Commit

Merge #51
51: Improvements r=ibacher a=ibacher

Usually PRs should be a single change, but here we are...

Uses `writedlm()` to generate output rather than the custom implementation we were using. In addition, it adds support for date shifting columns that contain non-date values alongside valid dates; this sometimes happens when the system generating the raw data uses a placeholder value to mark missing data. (A standalone toy sketch of both ideas follows the changed-files summary below.)

Co-authored-by: Ian <[email protected]>
bcbi-bot and ibacher committed Oct 23, 2019
2 parents c23f7a2 + 92ae019 commit 23a4961
Showing 6 changed files with 423 additions and 58 deletions.
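
In spirit the two changes are: write each output row with `writedlm` instead of hand-rolled `write` calls, and, when date shifting a column, try to parse each cell and leave the field blank when it is not a date. The following is a standalone toy sketch of that behaviour, not the package's actual code path: the column names, the `shift_cell` helper, the "UNKNOWN" placeholder, and the 14-day shift are all invented for illustration.

using Dates, DelimitedFiles

const FMT = dateformat"y-m-dTH:M:S.s"   # the package's default date format string

# Try to interpret a cell as a date/datetime and shift it; return `missing`
# when the cell holds a non-date marker (e.g. "UNKNOWN") used for missing data.
function shift_cell(val::AbstractString, shift_days::Integer)
    parsed = tryparse(DateTime, val, FMT)
    parsed === nothing && (parsed = tryparse(Date, val, FMT))
    parsed === nothing && return missing        # not a date: emit an empty field
    return parsed + Dates.Day(shift_days)
end

rows = [
    ["1001", "2019-10-01T08:30:00.0"],
    ["1002", "UNKNOWN"],                        # non-date value in a date column
]

open("deid_example.csv", "w") do io
    writedlm(io, reshape(["id", "visit_date"], 1, 2), ',')       # header row
    for row in rows
        shifted = shift_cell(row[2], 14)
        out = [row[1], shifted === missing ? "" : string(shifted)]
        writedlm(io, reshape(out, 1, length(out)), ',')           # one CSV row per record
    end
end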
6 changes: 4 additions & 2 deletions Project.toml
@@ -1,7 +1,7 @@
name = "DeIdentification"
uuid = "b905b068-7150-5b22-bc23-80596c88c6a6"
authors = ["Brown Center for Biomedical Informatics"]
version = "0.7.0"
version = "0.8.0"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
@@ -11,21 +11,23 @@ DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Memento = "f28f55f0-a522-5efc-85c2-fe41dfb9b2d9"
Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"

[compat]
julia = "^1.0.0"
CSV = "^0.5.14"
Parsers = "^0.3.7"
DataStructures = "^0.17.2"
Glob = "^1.2.0"
JSON = "^0.21.0"
Memento = "^0.12.1"
Tables = "^0.2.11"
YAML = "^0.3.2"
julia = "^1.0.0"

[extras]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
110 changes: 62 additions & 48 deletions src/DeIdentification.jl
@@ -13,6 +13,7 @@ import Random: shuffle, randstring, seed!, make_seed
import Memento
import DataStructures: OrderedDict
import REPL
import Parsers
using REPL.TerminalMenus
using DelimitedFiles

@@ -30,7 +31,14 @@ tracking identifier mappings.
"""
function deid_file!(dicts::DeIdDicts, fc::FileConfig, pc::ProjectConfig, logger)
# Initiate new file
infile = CSV.File(fc.filename, dateformat = fc.dateformat)
infile = try
CSV.File(fc.filename, dateformat = fc.dateformat)
catch e
# fall back to CSV.jl's own type detection when the configured date format does not apply
e isa ArgumentError || rethrow(e)
CSV.File(fc.filename)
end

dicts = DeIdDicts(dicts, fc.dateformat)

outfile = joinpath(pc.outdir, "deid_" * fc.name * "_" * getcurrentdate() * ".csv")

ncol = length(infile.names)
@@ -75,52 +83,60 @@ function deid_file!(dicts::DeIdDicts, fc::FileConfig, pc::ProjectConfig, logger)
writedlm(io, reshape(header, 1, length(header)), ',')

# Process each row
for row in infile

val = getoutput(dicts, Hash, getproperty(row, pcol), 0)
pid = setrid(val, dicts)

for col in infile.names
colname = get(fc.rename_cols, col, col)

action = get(fc.colmap, colname, Missing) ::Type
# drop cols
action == Drop && continue

VAL = getproperty(row, col)

# apply pre-processing transform
if haskey(fc.preprocess, colname) && !ismissing(VAL)
transform = fc.preprocess[colname]
transform = replace(transform, "VAL" => "\"$VAL\"")
expr = Meta.parse(transform)
VAL = Core.eval(@__MODULE__, expr)
end

VAL = getoutput(dicts, action, VAL, pid)

if col == pcol
VAL = pid
for (i, row) in Iterators.enumerate(infile)
try
val = getoutput(dicts, Hash, getproperty(row, pcol), 0)
pid = setrid(val, dicts)
columns = Vector{String}()

for col in infile.names
colname = get(fc.rename_cols, col, col)

action = get(fc.colmap, colname, Missing) ::Type

if action == Drop
continue
end

VAL = getproperty(row, col)

# apply pre-processing transform
if haskey(fc.preprocess, colname) && !ismissing(VAL)
transform = fc.preprocess[colname]
transform = replace(transform, "VAL" => "\"$VAL\"")
expr = Meta.parse(transform)
VAL = Core.eval(@__MODULE__, expr)
end

VAL = getoutput(dicts, action, VAL, pid)

if col == pcol
VAL = pid
end

# apply post-processing transform
if haskey(fc.postprocess, colname) && !ismissing(VAL)
transform = fc.postprocess[colname]
transform = replace(transform, "VAL" => "\"$VAL\"")
expr = Meta.parse(transform)
VAL = Core.eval(@__MODULE__, expr)
end

if eltype(VAL) <: String
VAL = replace(VAL, "\"" => "\\\"")
end

if VAL !== nothing && !ismissing(VAL)
push!(columns, string(VAL))
else
push!(columns, "")
end
end

# apply post-processing transform
if haskey(fc.postprocess, colname) && !ismissing(VAL)
transform = fc.postprocess[colname]
transform = replace(transform, "VAL" => "\"$VAL\"")
expr = Meta.parse(transform)
VAL = Core.eval(@__MODULE__, expr)
end

if eltype(VAL) <: String
VAL = replace(VAL, "\"" => "\\\"")
end

write(io, "\"$VAL\"")
if lastcol == col
write(io, '\n')
else
write(io, ",")
end
writedlm(io, reshape(columns, 1, length(columns)), ',')
catch e
Memento.error(logger, "$(Dates.now()) Error occurred while processing row $i")
rethrow(e)
end
end

@@ -129,8 +145,6 @@ function deid_file!(dicts::DeIdDicts, fc::FileConfig, pc::ProjectConfig, logger)
return nothing
end



"""
deidentify(cfg::ProjectConfig)
This is the constructor for the `DeIdentified` struct. We use this type to store
@@ -142,7 +156,7 @@ digest of the original primary ID to our new research IDs.
"""
function deidentify(cfg::ProjectConfig)
num_files = length(cfg.file_configs)
dicts = DeIdDicts(cfg.maxdays, cfg.shiftyears)
dicts = DeIdDicts(cfg.maxdays, cfg.shiftyears, cfg.dateformat)

if !isdir(cfg.outdir)
# mkpath also creates any intermediate paths
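
For context, the package's public entry point is unchanged by this PR. A minimal invocation is sketched below; the config file name deid_config.yml is an assumption for illustration, and the calls are qualified with the module name in case the symbols are not exported.

using DeIdentification

# assumes a project YAML config named deid_config.yml in the working directory
cfg = DeIdentification.ProjectConfig("deid_config.yml")
DeIdentification.deidentify(cfg)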
38 changes: 32 additions & 6 deletions src/de_identify.jl
@@ -25,7 +25,7 @@ struct FileConfig
rename_cols::Dict{Symbol,Symbol}
preprocess::Dict{Symbol, String}
postprocess::Dict{Symbol, String}
dateformat::String
dateformat::Dates.DateFormat
end


@@ -38,7 +38,7 @@ struct ProjectConfig
maxdays::Int
shiftyears::Int
primary_id::Symbol
dateformat::String
dateformat::Dates.DateFormat
end

"""
Expand All @@ -54,7 +54,7 @@ function ProjectConfig(cfg_file::String)
num_file = length(cfg["datasets"])
outdir = cfg["output_path"]
pk = Symbol(cfg["primary_id"])
dateformat = get(cfg, "date_format", "y-m-dTH:M:S.s")
dateformat = Dates.DateFormat(get(cfg, "date_format", "y-m-dTH:M:S.s"))

seed = get(_ -> make_seed()[1], cfg, "project_seed")
maxdays = get(cfg, "max_dateshift_days", 30)
@@ -66,7 +66,12 @@
# populate File Configs
for (i, ds) in enumerate(cfg["datasets"])
name = ds["name"]
file_dateformat = get(ds, "date_format", dateformat)
if haskey(ds, "date_format")
file_dateformat = Dates.DateFormat(get(ds, "date_format", "y-m-dTH:M:S.s"))
else
file_dateformat = dateformat
end

rename_dict = Dict{Symbol,Symbol}()
for pair in get(ds, "rename_cols", [])
rename_dict[Symbol(pair["in"])] = Symbol(pair["out"])
@@ -105,17 +110,22 @@ struct DeIdDicts
dateshift::Dict{Int, Int}
maxdays::Int
shiftyears::Int
dateformat::Dates.DateFormat
end

"""
DeIdDicts(maxdays)
DeIdDicts(maxdays, shiftyears, dateformat)
Structure containing dictionaries for project level mappings
- Primary ID -> Research ID
- Research ID -> DateShift number of days
- Research ID -> Salt value
"""
DeIdDicts(maxdays, shiftyears) = DeIdDicts(Dict{String, Int}(), Dict{Int, String}(), Dict{Int, Int}(), maxdays, shiftyears)
DeIdDicts(maxdays, shiftyears, dateformat) = DeIdDicts(
Dict{String, Int}(), Dict{Int, String}(), Dict{Int, Int}(), maxdays, shiftyears, dateformat)

DeIdDicts(current::DeIdDicts, dateformat::Dates.DateFormat) = DeIdDicts(
current.id, current.salt, current.dateshift, current.maxdays, current.shiftyears, dateformat)


"""
@@ -162,6 +172,22 @@ function dateshift_val!(dicts::DeIdDicts, val::Union{Dates.Date, Dates.DateTime,

end

function dateshift_val!(dicts::DeIdDicts, val::String, pid::Int)

newval = Parsers.tryparse(Dates.DateTime, val, Parsers.Options(dateformat=dicts.dateformat))
if newval === nothing
newval = Parsers.tryparse(Dates.Date, val, Parsers.Options(dateformat=dicts.dateformat))
end

if newval === nothing
@warn "Could not date shift non-date value $val"
return missing
end

return dateshift_val!(dicts, newval, pid)

end

"""
setrid(val, dicts)

2 comments on commit 23a4961

@ibacher (Contributor) commented on 23a4961 Nov 4, 2019

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/5043

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if Julia TagBot is installed, or can be done manually through the GitHub interface, or via:

git tag -a v0.8.0 -m "<description of version>" 23a496181f1a46ee100d1ba7073a384ac5dfaee0
git push origin v0.8.0
