
Commit

Merge #51
51: Improvements r=ibacher a=ibacher

Usually PRs should be a single change, but here we are...

Uses `writedlm()` to generate output rather than the custom implementation we were using. In addition, it adds support for date shifting columns that contain non-date values alongside valid dates; this sometimes happens when the system generating the raw data uses a placeholder value to mark missing data. (A standalone toy sketch of both ideas follows the changed-files summary below.)

Co-authored-by: Ian <[email protected]>
bcbi-bot and ibacher committed Oct 23, 2019
2 parents c23f7a2 + 92ae019 commit 23a4961
Showing 6 changed files with 423 additions and 58 deletions.
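
In spirit the two changes are: write each output row with `writedlm` instead of hand-rolled `write` calls, and, when date shifting a column, try to parse each cell and leave the field blank when it is not a date. The following is a standalone toy sketch of that behaviour, not the package's actual code path: the column names, the `shift_cell` helper, the "UNKNOWN" placeholder, and the 14-day shift are all invented for illustration.

using Dates, DelimitedFiles

const FMT = dateformat"y-m-dTH:M:S.s"   # the package's default date format string

# Try to interpret a cell as a date/datetime and shift it; return `missing`
# when the cell holds a non-date marker (e.g. "UNKNOWN") used for missing data.
function shift_cell(val::AbstractString, shift_days::Integer)
    parsed = tryparse(DateTime, val, FMT)
    parsed === nothing && (parsed = tryparse(Date, val, FMT))
    parsed === nothing && return missing        # not a date: emit an empty field
    return parsed + Dates.Day(shift_days)
end

rows = [
    ["1001", "2019-10-01T08:30:00.0"],
    ["1002", "UNKNOWN"],                        # non-date value in a date column
]

open("deid_example.csv", "w") do io
    writedlm(io, reshape(["id", "visit_date"], 1, 2), ',')       # header row
    for row in rows
        shifted = shift_cell(row[2], 14)
        out = [row[1], shifted === missing ? "" : string(shifted)]
        writedlm(io, reshape(out, 1, length(out)), ',')           # one CSV row per record
    end
end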
6 changes: 4 additions & 2 deletions Project.toml
@@ -1,7 +1,7 @@
name = "DeIdentification"
uuid = "b905b068-7150-5b22-bc23-80596c88c6a6"
authors = ["Brown Center for Biomedical Informatics"]
version = "0.7.0"
version = "0.8.0"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
@@ -11,21 +11,23 @@ DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Memento = "f28f55f0-a522-5efc-85c2-fe41dfb9b2d9"
Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"

[compat]
julia = "^1.0.0"
CSV = "^0.5.14"
Parsers = "^0.3.7"
DataStructures = "^0.17.2"
Glob = "^1.2.0"
JSON = "^0.21.0"
Memento = "^0.12.1"
Tables = "^0.2.11"
YAML = "^0.3.2"
julia = "^1.0.0"

[extras]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
110 changes: 62 additions & 48 deletions src/DeIdentification.jl
@@ -13,6 +13,7 @@ import Random: shuffle, randstring, seed!, make_seed
import Memento
import DataStructures: OrderedDict
import REPL
import Parsers
using REPL.TerminalMenus
using DelimitedFiles

@@ -30,7 +31,14 @@ tracking identifier mappings.
"""
function deid_file!(dicts::DeIdDicts, fc::FileConfig, pc::ProjectConfig, logger)
# Initiate new file
infile = CSV.File(fc.filename, dateformat = fc.dateformat)
infile = try
CSV.File(fc.filename, dateformat = fc.dateformat)
catch e
# fall back to CSV.jl's own type detection when the configured date format does not apply
e isa ArgumentError || rethrow(e)
CSV.File(fc.filename)
end

dicts = DeIdDicts(dicts, fc.dateformat)

outfile = joinpath(pc.outdir, "deid_" * fc.name * "_" * getcurrentdate() * ".csv")

ncol = length(infile.names)
@@ -75,52 +83,60 @@ function deid_file!(dicts::DeIdDicts, fc::FileConfig, pc::ProjectConfig, logger)
writedlm(io, reshape(header, 1, length(header)), ',')

# Process each row
for row in infile

val = getoutput(dicts, Hash, getproperty(row, pcol), 0)
pid = setrid(val, dicts)

for col in infile.names
colname = get(fc.rename_cols, col, col)

action = get(fc.colmap, colname, Missing) ::Type
# drop cols
action == Drop && continue

VAL = getproperty(row, col)

# apply pre-processing transform
if haskey(fc.preprocess, colname) && !ismissing(VAL)
transform = fc.preprocess[colname]
transform = replace(transform, "VAL" => "\"$VAL\"")
expr = Meta.parse(transform)
VAL = Core.eval(@__MODULE__, expr)
end

VAL = getoutput(dicts, action, VAL, pid)

if col == pcol
VAL = pid
for (i, row) in Iterators.enumerate(infile)
try
val = getoutput(dicts, Hash, getproperty(row, pcol), 0)
pid = setrid(val, dicts)
columns = Vector{String}()

for col in infile.names
colname = get(fc.rename_cols, col, col)

action = get(fc.colmap, colname, Missing) ::Type

if action == Drop
continue
end

VAL = getproperty(row, col)

# apply pre-processing transform
if haskey(fc.preprocess, colname) && !ismissing(VAL)
transform = fc.preprocess[colname]
transform = replace(transform, "VAL" => "\"$VAL\"")
expr = Meta.parse(transform)
VAL = Core.eval(@__MODULE__, expr)
end

VAL = getoutput(dicts, action, VAL, pid)

if col == pcol
VAL = pid
end

# apply post-processing transform
if haskey(fc.postprocess, colname) && !ismissing(VAL)
transform = fc.postprocess[colname]
transform = replace(transform, "VAL" => "\"$VAL\"")
expr = Meta.parse(transform)
VAL = Core.eval(@__MODULE__, expr)
end

if eltype(VAL) <: String
VAL = replace(VAL, "\"" => "\\\"")
end

if VAL !== nothing && !ismissing(VAL)
push!(columns, string(VAL))
else
push!(columns, "")
end
end

# apply post-processing transform
if haskey(fc.postprocess, colname) && !ismissing(VAL)
transform = fc.postprocess[colname]
transform = replace(transform, "VAL" => "\"$VAL\"")
expr = Meta.parse(transform)
VAL = Core.eval(@__MODULE__, expr)
end

if eltype(VAL) <: String
VAL = replace(VAL, "\"" => "\\\"")
end

write(io, "\"$VAL\"")
if lastcol == col
write(io, '\n')
else
write(io, ",")
end
writedlm(io, reshape(columns, 1, length(columns)), ',')
catch e
Memento.error(logger, "$(Dates.now()) Error occurred while processing row $i")
rethrow(e)
end
end

@@ -129,8 +145,6 @@ function deid_file!(dicts::DeIdDicts, fc::FileConfig, pc::ProjectConfig, logger)
return nothing
end



"""
deidentify(cfg::ProjectConfig)
This is the constructor for the `DeIdentified` struct. We use this type to store
@@ -142,7 +156,7 @@ digest of the original primary ID to our new research IDs.
"""
function deidentify(cfg::ProjectConfig)
num_files = length(cfg.file_configs)
dicts = DeIdDicts(cfg.maxdays, cfg.shiftyears)
dicts = DeIdDicts(cfg.maxdays, cfg.shiftyears, cfg.dateformat)

if !isdir(cfg.outdir)
# mkpath also creates any intermediate paths
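
For context, the package's public entry point is unchanged by this PR. A minimal invocation is sketched below; the config file name deid_config.yml is an assumption for illustration, and the calls are qualified with the module name in case the symbols are not exported.

using DeIdentification

# assumes a project YAML config named deid_config.yml in the working directory
cfg = DeIdentification.ProjectConfig("deid_config.yml")
DeIdentification.deidentify(cfg)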
38 changes: 32 additions & 6 deletions src/de_identify.jl
@@ -25,7 +25,7 @@ struct FileConfig
rename_cols::Dict{Symbol,Symbol}
preprocess::Dict{Symbol, String}
postprocess::Dict{Symbol, String}
dateformat::String
dateformat::Dates.DateFormat
end


@@ -38,7 +38,7 @@ struct ProjectConfig
maxdays::Int
shiftyears::Int
primary_id::Symbol
dateformat::String
dateformat::Dates.DateFormat
end

"""
Expand All @@ -54,7 +54,7 @@ function ProjectConfig(cfg_file::String)
num_file = length(cfg["datasets"])
outdir = cfg["output_path"]
pk = Symbol(cfg["primary_id"])
dateformat = get(cfg, "date_format", "y-m-dTH:M:S.s")
dateformat = Dates.DateFormat(get(cfg, "date_format", "y-m-dTH:M:S.s"))

seed = get(_ -> make_seed()[1], cfg, "project_seed")
maxdays = get(cfg, "max_dateshift_days", 30)
@@ -66,7 +66,12 @@
# populate File Configs
for (i, ds) in enumerate(cfg["datasets"])
name = ds["name"]
file_dateformat = get(ds, "date_format", dateformat)
if haskey(ds, "date_format")
file_dateformat = Dates.DateFormat(get(ds, "date_format", "y-m-dTH:M:S.s"))
else
file_dateformat = dateformat
end

rename_dict = Dict{Symbol,Symbol}()
for pair in get(ds, "rename_cols", [])
rename_dict[Symbol(pair["in"])] = Symbol(pair["out"])
@@ -105,17 +110,22 @@ struct DeIdDicts
dateshift::Dict{Int, Int}
maxdays::Int
shiftyears::Int
dateformat::Dates.DateFormat
end

"""
DeIdDicts(maxdays)
DeIdDicts(maxdays, shiftyears, dateformat)
Structure containing dictionaries for project level mappings
- Primary ID -> Research ID
- Research ID -> DateShift number of days
- Research ID -> Salt value
"""
DeIdDicts(maxdays, shiftyears) = DeIdDicts(Dict{String, Int}(), Dict{Int, String}(), Dict{Int, Int}(), maxdays, shiftyears)
DeIdDicts(maxdays, shiftyears, dateformat) = DeIdDicts(
Dict{String, Int}(), Dict{Int, String}(), Dict{Int, Int}(), maxdays, shiftyears, dateformat)

DeIdDicts(current::DeIdDicts, dateformat::Dates.DateFormat) = DeIdDicts(
current.id, current.salt, current.dateshift, current.maxdays, current.shiftyears, dateformat)


"""
@@ -162,6 +172,22 @@ function dateshift_val!(dicts::DeIdDicts, val::Union{Dates.Date, Dates.DateTime,

end

function dateshift_val!(dicts::DeIdDicts, val::String, pid::Int)

newval = Parsers.tryparse(Dates.DateTime, val, Parsers.Options(dateformat=dicts.dateformat))
if newval === nothing
newval = Parsers.tryparse(Dates.Date, val, Parsers.Options(dateformat=dicts.dateformat))
end

if newval === nothing
@warn "Could not date shift non-date value $val"
return missing
end

return dateshift_val!(dicts, newval, pid)

end

"""
setrid(val, dicts)

2 comments on commit 23a4961

@ibacher (Contributor) commented on 23a4961 Nov 4, 2019

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/5043

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if Julia TagBot is installed, or can be done manually through the GitHub interface, or via:

git tag -a v0.8.0 -m "<description of version>" 23a496181f1a46ee100d1ba7073a384ac5dfaee0
git push origin v0.8.0
