Skip to content

Commit

Permalink
Backend: Adds sex breakdown tables for select cancer screenings (#3615)
Browse files Browse the repository at this point in the history
# Description and Motivation
<!--- bulleted, high level items. use keywords (eg "closes #144" or
"fixes #4323") -->

- closes #3604 
- adjusts utils to keep the list of screenings that have a sex breakdown (lung,
colorectal) separate from those that don't have a sex breakdown (prostate, breast,
cervical)
- adjusts utility functions to accept a list of conditions as an argument; the
list is chosen based on the calling datasource (brfss/medicare)
and the breakdown (sex / non-sex)
- updates golden data
- better naming for brfss dataset typing
- stronger typing

## Has this been tested? How?

- tests updated and passing


## Types of changes

(leave all that apply)

- New content or feature


## New frontend preview link is below in the Netlify comment 😎
  • Loading branch information
benhammondmusic authored Sep 3, 2024
1 parent a9f3da0 commit 5f00b00
Show file tree
Hide file tree
Showing 7 changed files with 283 additions and 70 deletions.
8 changes: 6 additions & 2 deletions python/datasources/phrma.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
BENEFICIARIES,
BREAKDOWN_TO_STANDARD_BY_COL,
load_phrma_df_from_data_dir,
PHRMA_MEDICARE_CONDITIONS,
PHRMA_MEDICARE,
)


Expand Down Expand Up @@ -63,7 +65,7 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs):
demo_type = self.get_attr(attrs, 'demographic')
geo_level = self.get_attr(attrs, 'geographic')

alls_df = load_phrma_df_from_data_dir(geo_level, TMP_ALL, 'standard')
alls_df = load_phrma_df_from_data_dir(geo_level, TMP_ALL, PHRMA_MEDICARE, PHRMA_MEDICARE_CONDITIONS)

table_name = f'{demo_type}_{geo_level}'
df = self.generate_breakdown_df(demo_type, geo_level, alls_df)
Expand Down Expand Up @@ -121,7 +123,9 @@ def generate_breakdown_df(

fips_to_use = std_col.COUNTY_FIPS_COL if geo_level == COUNTY_LEVEL else std_col.STATE_FIPS_COL

breakdown_group_df = load_phrma_df_from_data_dir(geo_level, demo_breakdown, 'standard')
breakdown_group_df = load_phrma_df_from_data_dir(
geo_level, demo_breakdown, PHRMA_MEDICARE, PHRMA_MEDICARE_CONDITIONS
)

df = pd.concat([breakdown_group_df, alls_df], axis=0)
df = df.replace(to_replace=BREAKDOWN_TO_STANDARD_BY_COL)
Expand Down
55 changes: 16 additions & 39 deletions python/datasources/phrma_brfss.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@
SCREENING_ELIGIBLE,
BREAKDOWN_TO_STANDARD_BY_COL,
load_phrma_df_from_data_dir,
AGE_ADJ_RATE_LOWER,
PHRMA_CANCER_PCT_CONDITIONS_WITH_SEX_BREAKDOWN,
PHRMA_BRFSS,
TMP_ALL,
get_age_adjusted_ratios,
)
import numpy as np

"""
NOTE: Phrma data comes in .xlsx files, with breakdowns by sheet.
Expand Down Expand Up @@ -71,16 +73,22 @@ def generate_breakdown_df(
demo_col = std_col.RACE_CATEGORY_ID_COL if demo_breakdown == std_col.RACE_OR_HISPANIC_COL else demo_breakdown
all_val = std_col.Race.ALL.value if demo_breakdown == std_col.RACE_OR_HISPANIC_COL else ALL_VALUE

alls_df = load_phrma_df_from_data_dir(geo_level, 'all', 'cancer')
conditions = (
PHRMA_CANCER_PCT_CONDITIONS_WITH_SEX_BREAKDOWN
if demo_breakdown == std_col.SEX_COL
else PHRMA_CANCER_PCT_CONDITIONS
)

alls_df = load_phrma_df_from_data_dir(geo_level, TMP_ALL, PHRMA_BRFSS, conditions)
alls_df[demo_col] = all_val

breakdown_group_df = load_phrma_df_from_data_dir(geo_level, demo_breakdown, 'cancer')
breakdown_group_df = load_phrma_df_from_data_dir(geo_level, demo_breakdown, PHRMA_BRFSS, conditions)

df = pd.concat([breakdown_group_df, alls_df], axis=0)
df = df.replace(to_replace=BREAKDOWN_TO_STANDARD_BY_COL)

# ADHERENCE rate
for condition in PHRMA_CANCER_PCT_CONDITIONS:
for condition in conditions:
source_col_name = f'{condition}_{ADHERENCE_RATE_LOWER}'
het_col_name = f'{condition.lower()}_{SCREENED}_{std_col.PCT_RATE_SUFFIX}'
df[het_col_name] = df[source_col_name].round()
Expand All @@ -94,7 +102,7 @@ def generate_breakdown_df(
# rename count cols
rename_col_map = {}
count_to_pct_share_map = {}
for condition in PHRMA_CANCER_PCT_CONDITIONS:
for condition in conditions:

# source cols
source_rate_numerator = f'{condition}_{COUNT_YES_LOWER}'
Expand All @@ -121,7 +129,7 @@ def generate_breakdown_df(
std_col.add_race_columns_from_category_id(df)

# generate pct share columns
if demo_breakdown in [std_col.RACE_OR_HISPANIC_COL, std_col.AGE_COL]:
if demo_breakdown in [std_col.RACE_OR_HISPANIC_COL, std_col.AGE_COL, std_col.SEX_COL]:
# all demographics are known
df = generate_pct_share_col_without_unknowns(
df,
Expand All @@ -142,39 +150,8 @@ def generate_breakdown_df(
)

if demo_breakdown == std_col.RACE_OR_HISPANIC_COL:
df = get_age_adjusted_ratios(df)
df = get_age_adjusted_ratios(df, conditions)

df = df.sort_values(by=[std_col.STATE_FIPS_COL, demo_col]).reset_index(drop=True)

return df


def get_age_adjusted_ratios(df: pd.DataFrame) -> pd.DataFrame:
    """For every condition in PHRMA_CANCER_PCT_CONDITIONS, adds a column holding
    the ratio of each row's age-adjusted rate to the White NH age-adjusted rate
    found for the same state FIPS, rounded to 2 decimal places.

    The per-condition source rate column is dropped once its ratio is computed,
    and the ratio is set to NaN on rows whose race category is ALL."""

    white_rate_tmp = 'WHITE_NH_AGE_ADJ_RATE'  # scratch column, dropped again each pass
    white_nh_id = std_col.Race.WHITE_NH.value
    all_id = std_col.Race.ALL.value

    for condition in PHRMA_CANCER_PCT_CONDITIONS:
        rate_col = f'{condition}_{AGE_ADJ_RATE_LOWER}'
        ratio_col = f'{condition.lower()}_{SCREENED}_{std_col.RATIO_AGE_ADJUSTED_SUFFIX}'

        # Build a state FIPS -> White NH rate lookup for this condition
        is_white_nh = df[std_col.RACE_CATEGORY_ID_COL] == white_nh_id
        white_lookup = df.loc[is_white_nh].set_index(std_col.STATE_FIPS_COL)[rate_col]

        # Attach the White NH rate to every row via its state FIPS
        df[white_rate_tmp] = df[std_col.STATE_FIPS_COL].map(white_lookup)

        # Ratio of each row's rate to the White NH rate
        ratios = df[rate_col] / df[white_rate_tmp]
        df[ratio_col] = ratios.round(2)

        df = df.drop(columns=[white_rate_tmp, rate_col])

        # ALL rows do not get a ratio
        is_all = df[std_col.RACE_CATEGORY_ID_COL] == all_id
        df.loc[is_all, ratio_col] = np.nan

    return df
2 changes: 2 additions & 0 deletions python/ingestion/het_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
'age', 'sex', 'race_and_ethnicity', 'lis', 'eligibility', 'insurance_status', 'education', 'income', 'all'
]

PHRMA_DATASET_TYPE = Literal["brfss", "medicare"]

HIV_BREAKDOWN_TYPE = Literal['age', 'sex', 'race', 'race_and_ethnicity', 'black_women']
WISQARS_DEMO_TYPE = Literal["sex", "age", "race_and_ethnicity", "urbanicty", "all"]

Expand Down
71 changes: 54 additions & 17 deletions python/ingestion/phrma_utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
from ingestion.het_types import GEO_TYPE, PHRMA_BREAKDOWN_TYPE_OR_ALL, SEX_RACE_ETH_AGE_TYPE
from ingestion.het_types import GEO_TYPE, PHRMA_BREAKDOWN_TYPE_OR_ALL, SEX_RACE_ETH_AGE_TYPE, PHRMA_DATASET_TYPE
from ingestion import gcs_to_bq_util, dataset_utils
import ingestion.standardized_columns as std_col
from ingestion.constants import STATE_LEVEL, COUNTY_LEVEL, NATIONAL_LEVEL, US_FIPS
import pandas as pd
from typing import Dict, Literal, cast
import numpy as np
from typing import Dict, cast, List
from ingestion.merge_utils import merge_dfs_list

TMP_ALL = 'all'
PHRMA_BRFSS: PHRMA_DATASET_TYPE = 'brfss'
PHRMA_MEDICARE: PHRMA_DATASET_TYPE = 'medicare'

TMP_ALL: PHRMA_BREAKDOWN_TYPE_OR_ALL = 'all'
PHRMA_DIR = 'phrma'

ADHERENCE = 'adherence'
Expand All @@ -20,6 +24,7 @@
AGE_ADJ_RATE_LOWER = "age_adjusted_pct"
RACE_NAME_LOWER = "race_name"
AGE_GROUP_LOWER = "age_group"
SEX_NAME_LOWER = "sex_name"
INSURANCE_STATUS_LOWER = "insurance_status"
INCOME_GROUP_LOWER = "income_group"
EDUCATION_GROUP_LOWER = "education_group"
Expand Down Expand Up @@ -63,7 +68,11 @@
std_col.SCHIZOPHRENIA_PREFIX,
]

PHRMA_CANCER_PCT_CONDITIONS = ["Breast", "Cervical", "Colorectal", "Lung", "Prostate"]
PHRMA_MEDICARE_CONDITIONS = [*PHRMA_PCT_CONDITIONS, *PHRMA_100K_CONDITIONS]

PHRMA_CANCER_PCT_CONDITIONS_WITH_SEX_BREAKDOWN = ["Colorectal", "Lung"]
PHRMA_CANCER_PCT_CONDITIONS = ["Breast", "Cervical", "Prostate"] + PHRMA_CANCER_PCT_CONDITIONS_WITH_SEX_BREAKDOWN


BREAKDOWN_TO_STANDARD_BY_COL = {
std_col.AGE_COL: {
Expand Down Expand Up @@ -213,6 +222,7 @@ def rename_cols(

if breakdown == std_col.SEX_COL:
rename_cols_map[SEX_NAME] = std_col.SEX_COL
rename_cols_map[SEX_NAME_LOWER] = std_col.SEX_COL

if breakdown == std_col.ELIGIBILITY_COL:
rename_cols_map[ENTLMT_RSN_CURR] = std_col.ELIGIBILITY_COL
Expand Down Expand Up @@ -244,13 +254,14 @@ def rename_cols(
def load_phrma_df_from_data_dir(
geo_level: GEO_TYPE,
breakdown: PHRMA_BREAKDOWN_TYPE_OR_ALL,
data_type: Literal['standard', 'cancer'],
dataset_type: PHRMA_DATASET_TYPE,
conditions: List[str],
) -> pd.DataFrame:
"""Generates Phrma data by breakdown and geo_level
geo_level: string equal to `county`, `national`, or `state`
breakdown: string equal to `age`, `race_and_ethnicity`, `sex`, `lis`, `eligibility`,
`insurance_status`, `education`, `income`, or `all`
data_type: string equal to 'standard' or 'cancer' to determine which data to process
dataset_type: string equal to PHRMA_MEDICARE or PHRMA_BRFSS to determine which data to process
return: a single data frame of data by demographic breakdown and
geo_level with data columns loaded from multiple Phrma source tables"""

Expand All @@ -268,9 +279,9 @@ def load_phrma_df_from_data_dir(
fips_col = std_col.COUNTY_FIPS_COL if geo_level == COUNTY_LEVEL else std_col.STATE_FIPS_COL

breakdown_het_to_source_type = {
"age": AGE_GROUP if data_type == 'standard' else AGE_GROUP_LOWER,
"race_and_ethnicity": RACE_NAME if data_type == 'standard' else RACE_NAME_LOWER,
"sex": SEX_NAME,
"age": AGE_GROUP if dataset_type == PHRMA_MEDICARE else AGE_GROUP_LOWER,
"race_and_ethnicity": RACE_NAME if dataset_type == PHRMA_MEDICARE else RACE_NAME_LOWER,
"sex": SEX_NAME if dataset_type == PHRMA_MEDICARE else SEX_NAME_LOWER,
"lis": LIS,
"eligibility": ENTLMT_RSN_CURR,
"income": INCOME_GROUP_LOWER,
Expand All @@ -290,20 +301,15 @@ def load_phrma_df_from_data_dir(
keep_cols.append(COUNTY_FIPS)
if geo_level == STATE_LEVEL:
fips_length = 2
keep_cols.append(STATE_FIPS if data_type == 'standard' else STATE_FIPS_LOWER)
keep_cols.append(STATE_FIPS if dataset_type == PHRMA_MEDICARE else STATE_FIPS_LOWER)
if geo_level == NATIONAL_LEVEL:
fips_length = 2

topic_dfs = []
condition_keep_cols = []

if data_type == 'standard':
conditions = [*PHRMA_PCT_CONDITIONS, *PHRMA_100K_CONDITIONS]
else: # cancer
conditions = PHRMA_CANCER_PCT_CONDITIONS

for condition in conditions:
if data_type == 'standard':
if dataset_type == PHRMA_MEDICARE:
if condition in PHRMA_PCT_CONDITIONS:
condition_keep_cols = [*keep_cols, COUNT_YES, COUNT_TOTAL, ADHERENCE_RATE]
elif condition in PHRMA_100K_CONDITIONS:
Expand All @@ -326,7 +332,7 @@ def load_phrma_df_from_data_dir(
if breakdown == std_col.RACE_OR_HISPANIC_COL:
condition_keep_cols.append(AGE_ADJ_RATE_LOWER)

if data_type == 'standard':
if dataset_type == PHRMA_MEDICARE:
file_name = f'{condition}-{sheet_name}.csv'
subdirectory = condition
else: # cancer
Expand Down Expand Up @@ -364,3 +370,34 @@ def load_phrma_df_from_data_dir(
df_merged = dataset_utils.ensure_leading_zeros(df_merged, fips_col, fips_length)

return df_merged


def get_age_adjusted_ratios(df: pd.DataFrame, conditions: List[str]) -> pd.DataFrame:
    """Appends an age-adjusted ratio column for each item in `conditions`.

    df: data frame containing the race category id column, the state FIPS
        column, and a `{condition}_{AGE_ADJ_RATE_LOWER}` column per condition
    conditions: condition names used to build the source and output column names
    return: data frame where each source age-adjusted rate column has been
        replaced by a ratio column (row rate divided by the White NH rate for
        the same state FIPS, rounded to 2 decimals); rows whose race category
        is ALL hold NaN in the ratio columns"""

    race_id_col = std_col.RACE_CATEGORY_ID_COL
    fips_col = std_col.STATE_FIPS_COL
    baseline_col = 'WHITE_NH_AGE_ADJ_RATE'  # scratch column, removed again per condition

    for condition in conditions:
        rate_col = f'{condition}_{AGE_ADJ_RATE_LOWER}'
        ratio_col = f'{condition.lower()}_{SCREENED}_{std_col.RATIO_AGE_ADJUSTED_SUFFIX}'

        # White NH rate keyed by state FIPS; this is the denominator
        white_nh_rows = df[df[race_id_col] == std_col.Race.WHITE_NH.value]
        baseline_by_fips = white_nh_rows.set_index(fips_col)[rate_col]

        # Spread the baseline onto every row, then divide and round
        df[baseline_col] = df[fips_col].map(baseline_by_fips)
        df[ratio_col] = (df[rate_col] / df[baseline_col]).round(2)

        df = df.drop(columns=[baseline_col, rate_col])

        # Rows where race is ALL do not get a ratio
        df.loc[df[race_id_col] == std_col.Race.ALL.value, ratio_col] = np.nan

    return df
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
sex,colorectal_screened_estimated_total,colorectal_screening_eligible_estimated_total,state_fips,lung_screened_estimated_total,lung_screening_eligible_estimated_total,colorectal_screened_pct_rate,lung_screened_pct_rate,state_name,colorectal_screened_pct_share,colorectal_screening_eligible_population_pct,lung_screened_pct_share,lung_screening_eligible_population_pct
All,152617,217193,00,4811,16206,66.0,28.0,United States,100.0,100.0,100.0,100.0
Female,82352,115873,00,2249,7484,68.0,28.0,United States,54.0,53.4,46.7,46.2
Male,70265,101320,00,2562,8722,65.0,28.0,United States,46.0,46.6,53.3,53.8
Loading

0 comments on commit 5f00b00

Please sign in to comment.