Skip to content

Commit

Permalink
fix relatedness calculation for consanguinity and double relations
Browse files Browse the repository at this point in the history
  • Loading branch information
brentp committed Jul 3, 2019
1 parent 3f3d689 commit a50d730
Show file tree
Hide file tree
Showing 6 changed files with 358 additions and 204 deletions.
2 changes: 1 addition & 1 deletion bpbio.nimble
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Package
version = "0.1.5"
version = "0.1.7"
author = "Brent Pedersen"
description = "collection of command-line tools and small utility functions for genomics"
license = "MIT"
Expand Down
2 changes: 1 addition & 1 deletion src/bpbiopkg/info.nim
Original file line number Diff line number Diff line change
@@ -1 +1 @@
var version* = "0.1.6"
var version* = "0.1.7"
314 changes: 112 additions & 202 deletions src/bpbiopkg/pedfile.nim
Original file line number Diff line number Diff line change
Expand Up @@ -43,180 +43,103 @@ proc spouse*(s:Sample): Sample {.inline.} =
if k.dad == s: return k.mom
return k.dad

proc addAncestors(q: var Deque[Sample], o:Sample) =
if o.mom != nil:
q.addLast(o.mom)
if o.dad != nil:
q.addLast(o.dad)
if o.mom != nil:
q.addAncestors(o.mom)
if o.dad != nil:
q.addAncestors(o.dad)

proc successors(o:Sample, samples: var seq[Sample]) =
## add kids and kids of kids
for kid in o.kids:
samples.add(kid)
for kid in o.kids:
successors(kid, samples)

proc successors(o:Sample): seq[Sample] =
successors(o, result)

proc ancestors(o:Sample, samples:var seq[Sample]) =
if o.mom != nil:
samples.add(o.mom)
if o.dad != nil:
samples.add(o.dad)
if o.mom != nil:
ancestors(o.mom, samples)
if o.dad != nil:
ancestors(o.dad, samples)

proc ancestors(o:Sample): seq[Sample] =
ancestors(o, result)

proc hash*(s:Sample): Hash =
var h: Hash = 0

h = h !& hash(s.family_id)
h = h !& hash(s.id)
result = !$h

proc lowest_common_ancestors(samples:seq[Sample], a:Sample, b:Sample): seq[Sample] =
var all_ancestors = initSet[Sample](8)
var common_ancestors = initSet[Sample](2)
for i, target in @[a, b]:
var parents = initSet[Sample](8)
var q = initDeque[Sample]()

q.addFirst(target)
while q.len > 0:
var n = q.popFirst
if parents.contains(n): continue
# A predecessor of n is a node m such that there exists a directed edge from m to n.
# A successor of n is a node m such that there exists a directed edge from n to m.
#
# TODO: might need to flip addKids and ancestors.
q.addAncestors(n)
parents.incl(n)

if i == 0:
common_ancestors.incl(parents)
else:
common_ancestors = common_ancestors.intersection(parents)
all_ancestors.incl(parents)


for p in common_ancestors:
for child in p.successors:
if not common_ancestors.contains(child) and child in all_ancestors:
result.add(p)
break

type dsample = object
s: Sample
dist: int

proc dijkstra(samples:seq[Sample], a:Sample, b:Sample): int =
## return the shortest path from a to b. in this case a must be
## an predecessor (older than) b.
## see: https://github.com/mburst/dijkstras-algorithm/blob/master/dijkstras.py
var dsamples = newSeqOfCap[dsample](4)
var previous = initTable[Sample, Sample](4)
if a.family_id != b.family_id: return -1

var h = newHeap[dsample]() do (a, b: dsample) -> int:
return a.dist - b.dist

for v in samples:
if v.family_id != a.family_id: continue
if v == a: dsamples.add(dsample(s:v, dist:0))
else: dsamples.add(dsample(s:v, dist:samples.len + 2))
h.push(dsamples[dsamples.high])

while h.size > 0:
var dsmallest = h.pop()
var smallest = dsmallest.s
if smallest == b:
if not previous.contains(smallest): return -1
while previous.contains(smallest):
result += 1
smallest = previous[smallest]
return

if dsmallest.dist == samples.len + 2: break

for okid in dsmallest.s.kids:
# TODO: make this more efficient.
var kid:dsample
for s in dsamples:
if s.s == okid:
kid = s
break
var alt = dsmallest.dist + 1
if alt < kid.dist:
# TODO: make sure kid with updated dist gets into the heap
kid.dist = alt
previous[okid] = dsmallest.s

var heapSamples = toSeq(h.items)

h = newHeap[dsample](proc(a, b: dsample): int =
return a.dist - b.dist
)
for s in heapSamples:
if s.s.id == okid.id:
h.push(kid)
else:
h.push(s)
return -1


proc relatedness*(a:Sample, b:Sample, samples: seq[Sample]): float64 =
## coefficient of relatedness of 2 samples given their family.
## to differentiates siblings from parent-child relationships,
## siblings are return with a value of 0.49
## samples from different families are given a value of -1.
if a.family_id != b.family_id: return -1
if a.dad == b or a.mom == b: return 0.5
if b.dad == a or b.mom == a: return 0.5

# sibs get a special-case of 0.49 to differentiate from parent-child
if a.dad != nil and a.dad == b.dad and a.mom != nil and a.mom == b.mom:
return 0.49

if a notin samples: return -1'f64
if b notin samples: return -1'f64

var lca = lowest_common_ancestors(samples, b, a)
if lca.len == 0: return 0
var
amax: int = 1000 # hacky use of 1000 and empty value.
bmax: int = 1000
n = 0

for anc in lca:
if anc != a:
var d = dijkstra(samples, anc, a)
if d > 0:
amax = min(amax, d)
if anc != b:
var d = dijkstra(samples, anc, b)
if d > 0:
bmax = min(bmax, dijkstra(samples, anc, b))

# if one of the samples is in the set, then their distance is 1.
if a in lca:
amax = 1
if b in lca:
bmax = 1

if amax != 1000: n += 1 else: amax = 0
if bmax != 1000: n += 1 else: bmax = 0

# subtract 2 because the path includes the end sample which we don't need.
return n.float64 * pow(2'f64, -float64(amax + bmax))

type sample_path = tuple[sample:Sample, a_path:seq[Sample], b_path:seq[Sample]]

proc LCA(a:Sample, b:Sample, lcas: var HashSet[sample_path], a_path:seq[Sample], b_path:seq[Sample]) =

var a_path = a_path
var b_path = b_path

if a == nil: return
if b == nil: return

if a == b:
lcas.incl((a, a_path, b_path))

block:
var a_path = a_path & a.mom
LCA(a.mom, b, lcas, a_path, b_path)
block:
var a_path = a_path & a.dad
LCA(a.dad, b, lcas, a_path, b_path)
block:
var b_path = b_path & b.mom
LCA(a, b.mom, lcas, a_path, b_path)
block:
var b_path = b_path & b.dad
LCA(a, b.dad, lcas, a_path, b_path)

proc double_use(a:seq[Sample], b:seq[Sample]): bool {.inline.} =
# a is a chain of samples up to common ancestor
# b is chain of samples up to common ancestor
# if a and b have the same tails, then we have double-use
# and we don't count this chain
# http://genetic-genealogy.co.uk/Toc115570135.html
var ai = a.high
var bi = b.high
doAssert a[ai] == b[bi]
ai -= 1
bi -= 1
if ai < 0 or bi < 0: return false

# a:@[Sample(id:8451, i:-1, affected:false), Sample(id:8448, i:-1, affected:false)]
# b:@[Sample(id:8683, i:-1, affected:false), Sample(id:8451, i:-1, affected:false), Sample(id:8448, i:-1, affected:false)]

# go backwards through the chains
result = false
while ai > 0 or bi > 0:
#echo "testing:", ai, ",", bi
if a[ai] != b[bi]:
#echo "bailing. ai:", ai, " bi:", bi
return false
return true
#result = true
#ai -= 1
#bi -= 1
#if ai <= 0 or bi <= 0: break

when defined(superdebug):
echo "ai:", ai, " bi:", bi, "dup?", result

proc is_full_sib_with(a:Sample, b:Sample): bool {.inline.} =
return a.dad != nil and b.dad != nil and a.dad == b.dad and a.mom != nil and b.mom != nil and a.mom == b.mom

proc relatedness*(a:Sample, b:Sample): float =
if a.family_id != b.family_id: return -1.0
var r = initHashSet[sample_path](8)
LCA(a, b, r, @[a], @[b])
if len(r) == 0: return 0.0

# now get a seq and sort by shortest totlal path-length
var rs = newSeq[sample_path]()
for x in r:
when defined(superdebug):
echo ">>>>>>>>>>>>>>>>>>>>>>> "
echo "a:", x.a_path
echo "b:", x.b_path
if double_use(x.a_path, x.b_path):
when defined(superdebug):
echo "double"
echo "<<<<<<<<<<<<<<<<<<<<"
continue
rs.add(x)
when defined(superdebug):
echo "not double"
echo "<<<<<<<<<<<<<<<<<<<<"
#echo " "
for r in rs:
result += pow(2'f64, -float64(r.a_path.len - 1 + r.b_path.len - 1))#*pow(1 + prel, 0.5)

if result == 0.5 and a.is_full_sib_with(b):
result = 0.49

proc parse_ped*(path: string, verbose:bool=true): seq[Sample] =
result = new_seq_of_cap[Sample](10)
Expand Down Expand Up @@ -315,6 +238,7 @@ when isMainModule:

suite "pedfile test suite":

#[
test "that samples match those in vcf":
var osamples = samples.match(ovcf)
for i, s in osamples:
Expand All @@ -334,20 +258,7 @@ when isMainModule:
check sib.dad == samples[0].dad
check sib.mom == samples[0].mom

test "lowest common ancestor":
var k1 = Sample(family_id:"1", id:"kid1")
var k2 = Sample(family_id:"1", id:"kid2")
var mom = Sample(family_id:"1", id:"mom")
var dad = Sample(family_id:"1", id:"dad")
k1.mom = mom
k2.mom = mom
k1.dad = dad
k2.dad = dad
mom.kids.add(@[k1, k2])
dad.kids.add(@[k1, k2])

echo lowest_common_ancestors(@[k1, k2, mom, dad], k1, k2)
]#

test "dijkstra and relatedness":
var k1 = Sample(family_id:"1", id:"kid1")
Expand All @@ -357,39 +268,37 @@ when isMainModule:
var uncle = Sample(family_id: "1", id:"uncle")
var cousin = Sample(family_id: "1", id:"cousin")
var gma = Sample(family_id:"1", id:"gma")
var gpa = Sample(family_id:"1", id:"gma")
var ggma = Sample(family_id:"1", id:"ggma")
var ggpa = Sample(family_id:"1", id:"ggpa")
var unrel = Sample(family_id:"1", id:"un")
var extern = Sample(family_id:"xxx", id:"extern")
k1.mom = mom
k2.mom = mom
k1.dad = dad
k2.dad = dad
dad.mom = gma
dad.dad = gpa
uncle.mom = gma
uncle.dad = gpa
uncle.kids.add(cousin)
cousin.dad = uncle
gma.kids.add(@[dad, uncle])
mom.kids.add(@[k1, k2])
dad.kids.add(@[k1, k2])
ggma.kids.add(gma)
gma.mom = ggma
var fam = @[k1, k2, mom, dad, gma, ggma, cousin, uncle, unrel]
check 2 == dijkstra(fam, gma, k1)
check 1 == dijkstra(fam, mom, k1)
check 3 == dijkstra(fam, ggma, k1)
check 1 == dijkstra(fam, ggma, gma)
check 2 == dijkstra(fam, ggma, dad)
check -1 == dijkstra(fam, ggma, mom)

check relatedness(uncle, k1, fam) == 0.25
check relatedness(dad, k1, fam) == 0.5
check relatedness(k1, k2, fam) == 0.49 # ^^^^^
check relatedness(k2, mom, fam) == 0.5
check relatedness(k2, gma, fam) == 0.25
check relatedness(k2, ggma, fam) == 0.125
check relatedness(extern, gma, fam) == -1'f64
check relatedness(unrel, gma, fam) == 0.0'f64
check relatedness(k1, cousin, fam) == 0.125
gma.dad = ggpa

check relatedness(uncle, k1) == 0.25
check relatedness(dad, k1) == 0.5
check relatedness(k1, k2) == 0.49 # ^^^^^
check relatedness(k2, mom) == 0.5
check relatedness(k2, gma) == 0.25
check relatedness(k2, ggma) == 0.125
check relatedness(extern, gma) == -1'f64
check relatedness(unrel, gma) == 0.0'f64
check relatedness(k1, cousin) == 0.125

test "relatedness":
var k1 = Sample(family_id:"1", id:"kid1")
Expand All @@ -398,14 +307,15 @@ when isMainModule:
dad.kids.add(k1)
var fam = @[k1, dad]

check 0.5 == relatedness(k1, dad, fam)
check 0.5 == relatedness(k1, dad)

test "relatedness with no relations":
echo "xxx"
var a = Sample(family_id:"1", id:"a")
var b = Sample(family_id:"2", id:"b")
check relatedness(a, b, @[a, b]) == -1'f64
check relatedness(a, b) == -1'f64
b.family_id = "1"
check relatedness(a, b, @[a, b]) == -0'f64
check relatedness(a, b) == -0'f64


test "relatedness with 603 ceph samples":
Expand All @@ -415,7 +325,7 @@ when isMainModule:

for i, sampleA in samples[0..<samples.high]:
for j, sampleB in samples[i + 1..samples.high]:
var rel = relatedness(sampleA, sampleB, samples)
var rel = relatedness(sampleA, sampleB)
doAssert -1'f64 <= rel
if rel > 0.5:
echo &"{$sampleA} {$sampleB} {$rel}"
Expand Down
Loading

0 comments on commit a50d730

Please sign in to comment.