Merge pull request #1 from humandx/feat/download-and-extract-NAMCS

Added NAMCS extractor and converter for public dataset
humandx · Jun 15, 2018 · 3e68a51 · 3e68a51
2 parents df685e4 + 0ff8980
commit 3e68a51
Show file tree

Hide file tree

Showing 40 changed files with 6,628 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,23 @@
+# This file lists the directories and files that need to be ignored.
+build/
+dist/
+namcs.egg-info/
+namcs/data/
+namcs/namcs/__pycache__/
+namcs/utils/__pycache__/
+test/
+all_namcs_data.tsv
+namcs/__pycache__/
+namcs/general/__pycache__/
+namcs/helpers/__pycache__/
+namcs/mapper/__pycache__/
+namcs/scripts/__pycache__/
+namcs_test.py
+hdx_ahcd/data/
+hdx_ahcd/__pycache__/
+hdx_ahcd/general/__pycache__/
+hdx_ahcd/helpers/__pycache__/
+hdx_ahcd/mapper/__pycache__/
+hdx_ahcd/namcs/__pycache__/
+hdx_ahcd/scripts/__pycache__/
+hdx_ahcd/utils/__pycache__/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1 @@
+To be decided.
diff --git a/README.md b/README.md
@@ -1,2 +1,121 @@
-# hdx-data-extraction-ahcd
-Code to parse and clean the CDC's Ambulatory Health Care Data (AHCD) (NAMCS and NHAMCS): https://www.cdc.gov/nchs/ahcd/about_ahcd.htm
+# HDX-data-extraction-AHCD
+------
+Code to parse and clean the CDC's Ambulatory Health Care Data (AHCD) (NAMCS and NHAMCS): https://www.cdc.gov/nchs/ahcd/about_ahcd.htm.
+  - # NAMCS
+    The National Ambulatory Medical Care Survey (NAMCS) is a national survey designed to meet the need for objective, reliable information about the provision and use of ambulatory medical care services in the United States. Findings are based on a sample of visits to nonfederally employed office-based physicians who are primarily engaged in direct patient care.
+  - # NHAMCS
+    The National Hospital Ambulatory Medical Care Survey (NHAMCS) is designed to collect data on the utilization and provision of ambulatory care services in hospital emergency and outpatient departments, and in ambulatory surgery centers.
+
+# Code Structure
+>   **hdx_ahcd** serves as base directory
+```sh
+hdx_ahcd
+├── general
+│   ├── __init__.py
+│   ├── namcs_converter.py
+│   └── namcs_extractor.py
+├── helpers
+│   ├── functions.py
+│   └── __init__.py
+├── mapper
+│   ├── functions.py
+│   ├── __init__.py
+│   └── years.py
+├── namcs
+│   ├── config.py
+│   ├── constants.py
+│   ├── enums.py
+│   └── __init__.py
+├── scripts
+│   ├── controllers.py
+│   ├── __init__.py
+│   └── validation.py
+└── utils
+    ├── context.py
+    ├── decorators.py
+    ├── exceptions.py
+    ├── __init__.py
+    └── utils.py
+```
+* general
+    - namcs_extractor.py - download and extract public NAMCS data
+    - namcs_converter.py - process and convert NAMCS data into a human-readable format
+* helpers - various methods for manipulating the dataset and its details
+* mapper
+    - functions - methods to translate raw data from the dataset into a human-readable format
+    - years - year wise NAMCS details like fields, their position in dataset, length etc.
+* namcs - contains configurable parameters and constants
+* scripts
+    - controllers - provide common entry point for execution
+    - validation - validation of dataset and parameters provided while invoking script controllers
+* utils - contains useful decorators, context managers etc.
+* namcs_test.py - script to perform regression for all NAMCS years (DEV purpose only).
+
+### Installation
+-----
+Currently supported Python version: 3.6.x.
+To check python version
+```sh
+python --version
+```
+Ensure pip, setuptools, and wheel are up to date
+```sh
+python -m pip install --upgrade pip setuptools wheel
+```
+If you have a local copy of this repo and want to install directly from it:
+```sh
+pip install ${PATH_FOR_HDX-data-extraction-AHCD_REPO}
+```
+Similarly you can execute setup file
+```sh
+python3 ${PATH_FOR_HDX-data-extraction-AHCD_REPO}/setup.py install
+```
+for example:
+```sh
+pip install /var/tmp/HDX-data-extraction-AHCD/
+```
+and
+for example:
+```sh
+python3 /var/tmp/HDX-data-extraction-AHCD/setup.py install
+```
+You can also use pip directly for installation.
+```sh
+pip install hdx_ahcd
+```
+-----
+### Usage
+-----
+```sh
+>>> import hdx_ahcd
+>>> from hdx_ahcd import get_cleaned_data_by_year
+```
+### Uninstall
+-----
+To uninstall you can use either
+```sh
+easy_install -m hdx_ahcd
+```
+or
+```sh
+pip uninstall hdx_ahcd
+```
+### TODO
+-----
+- Support for all years
+    - supported years are 1973, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1985, 1989, 1990, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015
+ - Support for more fields
+    - supported fields are
+        - date_of_visit
+        - date_of_birth
+        - year_of_visit
+        - year_of_birth
+        - month_of_visit
+        - month_of_birth
+        - patient_age
+        - gender
+        - physician_diagnosis
+ ---
+License
+----
+To be discussed.
diff --git a/__init__.py b/__init__.py
diff --git a/hdx_ahcd/__init__.py b/hdx_ahcd/__init__.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+"""
+This file contains parameters related to initialization of package
+"""
+# Other modules
+from hdx_ahcd.scripts.controllers import NAMCSController as __Controller
+
+# 3rd party modules
+# -N/A
+# Name of this package.
+name = "hdx_ahcd"
+# Public entry point of the package: the bound `execute` method of a
+# `NAMCSController` instance that is created once, at import time.
+get_cleaned_data_by_year = __Controller().execute
diff --git a/hdx_ahcd/general/__init__.py b/hdx_ahcd/general/__init__.py
diff --git a/hdx_ahcd/general/namcs_converter.py b/hdx_ahcd/general/namcs_converter.py
@@ -0,0 +1,214 @@
+# -*- coding: utf-8 -*-
+"""
+This file translates raw NAMCS patient case data into a converted CSV file.
+"""
+# Python modules
+import csv
+import os
+from collections import defaultdict
+
+# Other modules
+from hdx_ahcd.helpers.functions import (
+    get_customized_file_name,
+    get_field_code_from_record,
+    get_iterable,
+    get_namcs_dataset_path_for_year,
+    get_normalized_namcs_file_name,
+    get_namcs_source_file_info,
+    process_multiple_slice_objects,
+    populate_missing_fields,
+    safe_read_file
+)
+from hdx_ahcd.mapper import years
+from hdx_ahcd.namcs.config import (
+    CONVERTED_CSV_FIELDS,
+    CONVERTED_CSV_FILE_NAME_SUFFIX,
+    ERROR_FILES_DIR_PATH,
+    NAMCS_DATA_DIR_PATH,
+    log,
+    YEARS_AVAILABLE)
+from hdx_ahcd.namcs.enums import NAMCSFieldEnum
+from hdx_ahcd.utils.context import try_except
+from hdx_ahcd.utils.decorators import (
+    create_path_if_does_not_exists
+)
+from hdx_ahcd.utils.utils import detailed_exception_info
+
+# 3rd party modules
+# -N/A
+
+# Global vars
+# -N/A
+
+
+@create_path_if_does_not_exists(ERROR_FILES_DIR_PATH)
+def get_generator_by_year(year, namcs_raw_dataset_file=None):
+    """
+    Method to translate raw NAMCS patient case data data for given year.
+
+    Parameters:
+        year (:class:`int`): NAMCS year for which raw data needs to be
+            translated.
+        namcs_raw_dataset_file (:class:`str`): File path for
+            raw dataset input file.
+
+    Returns:
+        :class:`generator` : Generator object containing converted
+            raw NAMCS patient case data data for given year.
+    """
+    year_class_object = vars(years).get("Year{}".format(year))
+    dataset_file = namcs_raw_dataset_file if namcs_raw_dataset_file else \
+        get_namcs_dataset_path_for_year(year)
+
+    # Calculating `SOURCE_FILE_ID`
+    source_file_id = get_normalized_namcs_file_name(year)
+
+    # Error file name
+    error_file = os.path.join(
+        ERROR_FILES_DIR_PATH, get_customized_file_name(source_file_id,
+                                                       extension = "err")
+    )
+
+    # Removing existing error file
+    if os.path.exists(error_file):
+        with try_except():
+            os.remove(error_file)
+
+    # Error file name headers
+    error_file_headers = ("record_no", "exception", "record")
+
+    errors = []
+
+    field_mappings = year_class_object.get_field_slice_mapping()
+
+    with open(dataset_file, "r") as dataset_file_handler:
+        for line_no, line in safe_read_file(dataset_file_handler):
+            write_line = {}
+            try:
+                write_line[NAMCSFieldEnum.SOURCE_FILE_ID.value] = \
+                    source_file_id
+                write_line[NAMCSFieldEnum.SOURCE_FILE_ROW.value] = \
+                    line_no + 1
+                for field_name, slice_object in field_mappings.items():
+                    # If slice_object is tuple evaluating all items at
+                    # last to club all the results under one `field_name`.
+                    if isinstance(slice_object, (list, tuple)):
+                        code = process_multiple_slice_objects(
+                            line, field_name, slice_object
+                        )
+                    else:
+                        code = get_field_code_from_record(
+                            line, field_name, slice_object
+                        )
+
+                    write_line[field_name] = ",".join(code) if isinstance(
+                        code, (tuple, list)
+                    ) else code
+
+                # Call to method `populate_missing_fields` to calculate
+                # all missing fields.
+                write_line = \
+                    populate_missing_fields(CONVERTED_CSV_FIELDS,
+                                            write_line)
+            except Exception as exc:
+                detailed_exception_info(logger=log)
+                errors.append(
+                    {
+                        "record_no": line_no + 1,
+                        "record": line,
+                        "exception": str(exc)
+                    }
+                )
+            yield write_line
+        if errors:
+            # TODO: discard record or replace None value for erroneous field
+            with open(error_file, "w") as error_file_handler:
+                writer = csv.DictWriter(error_file_handler,
+                                        delimiter = ',',
+                                        fieldnames = error_file_headers)
+                writer.writeheader()
+                for _error in errors:
+                    writer.writerow(_error)
+                log.info("\n******\n"
+                         "Finished writing to error file:{}".format(error_file))
+
+
+def export_to_csv(year, generator_object):
+    """
+    Method to dump the converted raw NAMCS patient case data data
+    into CSV file as defined by `CONVERTED_CSV_FILE_NAME_SUFFIX`
+    for given year.
+
+    Parameters:
+        year (:class:`int` or :class:`str`): NAMCS year for which raw
+            data needs to be translated.
+        generator_object (:class:`generator`): Generator object containing
+            converted raw NAMCS patient case data data for given year.
+
+    Returns:
+        :class:`str` : File path for `CONVERTED_CSV_FILE_NAME_SUFFIX`.
+    """
+    # Calculating `SOURCE_FILE_ID`
+    source_file_id = get_normalized_namcs_file_name(year)
+
+    # Output csv file converting the initial dataset into mapped data
+    converted_csv_file = os.path.join(
+        NAMCS_DATA_DIR_PATH, get_customized_file_name(
+            source_file_id, CONVERTED_CSV_FILE_NAME_SUFFIX, extension = "csv"
+        )
+    )
+
+    with try_except():
+        with open(converted_csv_file, 'w') as csv_file:
+            writer = csv.DictWriter(csv_file,
+                                    delimiter = ',',
+                                    fieldnames = CONVERTED_CSV_FIELDS)
+            writer.writeheader()
+            for write_line in generator_object:
+                writer.writerow(write_line)
+            log.info("Finished writing to the file %s" % converted_csv_file)
+
+    return os.path.realpath(converted_csv_file)
+
+
+def get_year_wise_generator(year=None, namcs_raw_dataset_file=None,
+                            do_export = False):
+    """
+    Method to convert raw NAMCS PCD data into CSV, and return Dictionary
+    containing generator of converted raw NAMCS patient case data for
+    given year if `do_export` is false else dump the converted raw NAMCS
+    patient case data into CSV file as defined by
+    `CONVERTED_CSV_FILE_NAME_SUFFIX` for given year.
+
+    Parameters:
+        year (:class:`int` or :class:`tuple` or :class:`list`): NAMCS year
+            for which raw data needs to be translated.
+        namcs_raw_dataset_file (:class:`str`): File path for
+            raw dataset input file default value None.
+        do_export (:class:`bool`) : Flag to indicate if to dump the converted
+            raw NAMCS patient case data into CSV file as defined by
+            `CONVERTED_CSV_FILE_NAME_SUFFIX` for given year default value False.
+
+    Returns:
+        :class:`defaultdict` : Dictionary containing generator of converted
+            raw NAMCS patient case data for given year.
+    """
+    year_wise_mld = defaultdict(dict)
+    # If year not specified, get generator object
+    if year is None:
+        year = YEARS_AVAILABLE
+
+    year = get_iterable(year)
+    for _year in year:
+        year_wise_mld[_year]["generator"] = \
+            get_generator_by_year(_year, namcs_raw_dataset_file)
+        year_wise_mld[_year]["source_file_info"] = \
+            get_namcs_source_file_info(_year)
+
+    if do_export and year_wise_mld:
+        for _year in year:
+            year_wise_mld[_year]["file_name"] = \
+                export_to_csv(_year, year_wise_mld[_year]["generator"])
+
+    return year_wise_mld
+