## params.yaml: this file holds all parameters to be passed into PyCoGAPS.
## To modify the default parameters, replace the values below with your own and save the file.
# RELATIVE path to data -- make sure to move your data into the created data/ folder
path: data/ModSimData.txt
# result output file name (output saved as a .h5ad file)
result_file: ModSimResult.h5ad
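# Example (hypothetical file names, shown for illustration only): to run on your own
# dataset, replace the two values above, e.g.
#   path: data/MyCounts.csv
#   result_file: MyCountsResult.h5ad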
standard_params:
# number of patterns CoGAPS will learn
nPatterns: 3
# number of iterations for each phase of the algorithm
nIterations: 1000
# random number generator seed
seed: 0
# speeds up performance with sparse data (roughly >80% of the data is zero); note this can only be used with the default uncertainty
useSparseOptimization: False
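# Illustrative override (values are assumptions, not tuned recommendations): for a large,
# mostly-zero matrix you might edit the block above to read
#   nPatterns: 8
#   nIterations: 5000
#   useSparseOptimization: True
# keeping in mind that sparse optimization only works with the default uncertainty.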
run_params:
# maximum number of threads to run on
nThreads: 1
# T/F for displaying output
messages: True
# number of iterations between each output (set to 0 to disable status updates)
outputFrequency: 500
# uncertainty matrix - either a matrix or a supported file type
uncertainty: null
# name of the checkpoint file to create
checkpointOutFile: gaps_checkpoint.out
# number of iterations between each checkpoint (set to 0 to disable checkpoints)
checkpointInterval: 250
# if this is provided, CoGAPS runs from the checkpoint contained in this file (see the resume sketch at the end of run_params)
checkpointInFile: null
# T/F for transposing the data while reading it in - useful for data stored as samples x genes, since CoGAPS requires genes x samples
transposeData: False
# if calling CoGAPS in parallel, the worker ID can be specified
workerID: 1
# enable asynchronous updating which allows for multi-threaded runs
asynchronousUpdates: True
# how many snapshots to take in each phase (set to 0 to disable snapshots)
nSnapshots: 0
# which phase to take snapshots in, e.g. "equilibration", "sampling", or "all"
snapshotPhase: sampling
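# Sketch of a resumed run (assumes a checkpoint file from an earlier run exists):
#   checkpointInFile: gaps_checkpoint.out
# with checkpointOutFile left set so the resumed run continues writing checkpoints.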
sparsity_params:
# sparsity parameter for feature matrix
alphaA: 0.01
# sparsity parameter for sample matrix
alphaP: 0.01
# atomic mass restriction for feature matrix
maxGibbsMassA: 100
# atomic mass restriction for sample matrix
maxGibbsMassP: 100
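# Illustrative override (assumed values, not tuned recommendations): these four priors are
# usually left at their defaults; to change them you would edit the values in place, e.g.
#   alphaA: 0.05
#   alphaP: 0.05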
distributed_params:
# either null or genome-wide
distributed: null
# number of sets to break data into
nSets: 4
# number of branches at which to cut dendrogram used in pattern matching
# default: nPatterns
cut: null
# minimum number of individual set contributions a cluster must contain
# default: math.ceil(cut / 2)
minNS: null
# maximum number of individual set contributions a cluster can contain
# default: minNS + nSets
maxNS: null
# specify subsets by index or name
explicitSets: null
# specify categories along the rows (cols) to use for weighted sampling
samplingAnnotation: null
# weights associated with samplingAnnotation
samplingWeight: null
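# Example (illustrative values): a genome-wide distributed run over 8 sets, leaving
# cut/minNS/maxNS null so their stated defaults (cut = nPatterns, minNS = ceil(cut / 2),
# maxNS = minNS + nSets) apply:
#   distributed: genome-wide
#   nSets: 8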
additional_params:
# set of indices to use from the data
subsetIndices: null
# which dimension (0=rows, 1=cols) to subset
subsetDim: 0
# vector of names of genes in data
geneNames: null
# vector of names of samples in data
sampleNames: null
# fix either the 'A' or 'P' matrix to these values; in distributed CoGAPS, the first phase is skipped
# and fixedPatterns is used for all sets, allowing manual pattern matching as well as fixed runs of standard CoGAPS
fixedPatterns: null
# either 'A' or 'P', indicating which matrix is fixed
whichMatrixFixed: null
# whether or not to take PUMP samples
takePumpSamples: False
# key of the dataset when reading .h5 files
hdfKey: null
# key of the row names when reading .h5 files
hdfRowKey: null
# key of the column names when reading .h5 files
hdfColKey: null
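# Example (hypothetical file and key names): to read a matrix stored in an HDF5 file,
# point path at the .h5 file and name its keys, e.g.
#   path: data/MyData.h5
#   hdfKey: counts
#   hdfRowKey: genes
#   hdfColKey: samples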
aws_params:
# whether or not to use an AWS S3 bucket for file transfer
useAWS: False
# name of bucket to download from
downloadBucket: null
# name of key to download from
downloadKey: null
# name of bucket to upload to
uploadBucket: null
# name of key to upload to
uploadKey: null
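# Example (hypothetical bucket and key names): to pull input from S3 and push the result
# back when the run finishes, you might set
#   useAWS: True
#   downloadBucket: my-cogaps-data
#   downloadKey: ModSimData.txt
#   uploadBucket: my-cogaps-results
#   uploadKey: results/ModSimResult.h5ad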