From 51edb056f1b124c54e1c8d757c6ec14c6fc25126 Mon Sep 17 00:00:00 2001 From: aysim319 Date: Tue, 14 Jan 2025 10:03:57 -0500 Subject: [PATCH 1/7] 2088 patch nhsn (#2094) * in progress * changed insertions to be weekday index * finishing up test and cleaning up * lint * more test * fixed tests * more test fix * fixed pull test * Update nhsn/delphi_nhsn/patch.py docs Co-authored-by: minhkhul <118945681+minhkhul@users.noreply.github.com> * changed so only the latest day in epiweek gets patched in * making dataset_id into constants * lint --------- Co-authored-by: minhkhul <118945681+minhkhul@users.noreply.github.com> --- nhsn/delphi_nhsn/constants.py | 3 + nhsn/delphi_nhsn/patch.py | 93 ++++++++++++++ nhsn/delphi_nhsn/pull.py | 59 ++++++++- nhsn/delphi_nhsn/run.py | 24 ++-- nhsn/tests/conftest.py | 11 +- nhsn/tests/test_data/20241212.csv.gz | Bin 0 -> 3446 bytes nhsn/tests/test_data/20241212_prelim.csv.gz | Bin 0 -> 3410 bytes nhsn/tests/test_patch.py | 130 ++++++++++++++++++++ nhsn/tests/test_pull.py | 47 +++++-- nhsn/tests/test_run.py | 6 + 10 files changed, 344 insertions(+), 29 deletions(-) create mode 100644 nhsn/delphi_nhsn/patch.py create mode 100644 nhsn/tests/test_data/20241212.csv.gz create mode 100644 nhsn/tests/test_data/20241212_prelim.csv.gz create mode 100644 nhsn/tests/test_patch.py diff --git a/nhsn/delphi_nhsn/constants.py b/nhsn/delphi_nhsn/constants.py index e6e6e4359..d51241b4f 100644 --- a/nhsn/delphi_nhsn/constants.py +++ b/nhsn/delphi_nhsn/constants.py @@ -2,6 +2,9 @@ GEOS = ["state", "nation", "hhs"] +MAIN_DATASET_ID = "ua7e-t2fy" +PRELIM_DATASET_ID = "mpgq-jmmr" + # column name from socrata TOTAL_ADMISSION_COVID_API = "totalconfc19newadm" TOTAL_ADMISSION_FLU_API = "totalconfflunewadm" diff --git a/nhsn/delphi_nhsn/patch.py b/nhsn/delphi_nhsn/patch.py new file mode 100644 index 000000000..31fae3070 --- /dev/null +++ b/nhsn/delphi_nhsn/patch.py @@ -0,0 +1,93 @@ +""" +This module is used for patching data in the delphi_nhsn package. + +To use this module, you need to specify the range of issue dates in params.json, like so: + +{ + "common": { + ... + }, + "validation": { + ... + }, + "patch": { + "patch_dir": "/Users/minhkhuele/Desktop/delphi/covidcast-indicators/nhsn/patch" + } +} + +It will generate data for the range of issue dates corresponding to source data files available in "backup_dir" +specified under "common", and store them in batch issue format under "patch_dir": +[name-of-patch]/issue_[issue-date]/nhsn/actual_data_file.csv +""" + +from datetime import datetime +from os import makedirs +from pathlib import Path +from typing import List + +from delphi_utils import get_structured_logger, read_params +from epiweeks import Week + +from .run import run_module + + +def filter_source_files(source_files: List[Path]): + """ + Filter patch files such that each element in the list is an unique epiweek with the latest issue date. + + Parameters + ---------- + source_files + + Returns + ------- + list of issue dates + + """ + epiweek_dict = dict() + + for file in source_files: + if "prelim" not in file.stem: + current_issue_date = datetime.strptime(file.name.split(".")[0], "%Y%m%d") + epiweek = Week.fromdate(current_issue_date) + epiweek_dict[epiweek] = file + + filtered_patch_list = list(epiweek_dict.values()) + return filtered_patch_list + + +def patch(params): + """ + Run the doctor visits indicator for a range of issue dates. + + The range of issue dates is specified in params.json using the following keys: + - "patch": Only used for patching data + - "patch_dir": str, directory to write all issues output + """ + logger = get_structured_logger("delphi_nhsn.patch", filename=params["common"]["log_filename"]) + + source_files = sorted(Path(params["common"]["backup_dir"]).glob("*.csv.gz")) + makedirs(params["patch"]["patch_dir"], exist_ok=True) + + logger.info( + "Starting patching", + patch_directory=params["patch"]["patch_dir"], + start_issue=source_files[0].name.split(".")[0], + end_issue=source_files[-1].name.split(".")[0], + ) + + patch_list = filter_source_files(source_files) + for file in patch_list: + issue_date = datetime.strptime(file.name.split(".")[0], "%Y%m%d") + current_issue_ew = Week.fromdate(issue_date) + logger.info("Running issue", issue_date=issue_date.strftime("%Y-%m-%d")) + params["patch"]["issue_date"] = issue_date.strftime("%Y%m%d") + current_issue_dir = f"{params['patch']['patch_dir']}/issue_{current_issue_ew}/nhsn" + makedirs(current_issue_dir, exist_ok=True) + params["common"]["export_dir"] = current_issue_dir + params["common"]["custom_run"] = True + run_module(params, logger) + + +if __name__ == "__main__": + patch(read_params()) diff --git a/nhsn/delphi_nhsn/pull.py b/nhsn/delphi_nhsn/pull.py index 2e1114142..7377ef958 100644 --- a/nhsn/delphi_nhsn/pull.py +++ b/nhsn/delphi_nhsn/pull.py @@ -1,13 +1,14 @@ # -*- coding: utf-8 -*- """Functions for pulling NSSP ER data.""" import logging +from pathlib import Path from typing import Optional import pandas as pd from delphi_utils import create_backup_csv from sodapy import Socrata -from .constants import PRELIM_SIGNALS_MAP, PRELIM_TYPE_DICT, SIGNALS_MAP, TYPE_DICT +from .constants import MAIN_DATASET_ID, PRELIM_DATASET_ID, PRELIM_SIGNALS_MAP, PRELIM_TYPE_DICT, SIGNALS_MAP, TYPE_DICT def pull_data(socrata_token: str, dataset_id: str): @@ -27,7 +28,42 @@ def pull_data(socrata_token: str, dataset_id: str): return df -def pull_nhsn_data(socrata_token: str, backup_dir: str, custom_run: bool, logger: Optional[logging.Logger] = None): +def pull_data_from_file(filepath: str, issue_date: str, logger, prelim_flag=False) -> pd.DataFrame: + """ + Pull data from source file. + + The source file is generated from delphi_utils.create_backup_csv + Parameters + ---------- + filepath: full path where the source file is located + issue_date: date when the file was pulled / generated + logger + prelim_flag: boolean to indicate which dataset to grab + + Returns + ------- + pd.DataFrame + Dataframe as described above. + """ + df = pd.DataFrame() + if issue_date: + issue_date = issue_date.replace("-", "") + filename = f"{issue_date}_prelim.csv.gz" if prelim_flag else f"{issue_date}.csv.gz" + backup_file = Path(filepath, filename) + + if backup_file.exists(): + df = pd.read_csv(backup_file, compression="gzip") + logger.info("Pulling data from file", file=filename, num_rows=len(df)) + return df + + +def pull_nhsn_data( + socrata_token: str, + backup_dir: str, + custom_run: bool, + issue_date: Optional[str], + logger: Optional[logging.Logger] = None, +): """Pull the latest NHSN hospital admission data, and conforms it into a dataset. The output dataset has: @@ -52,7 +88,11 @@ def pull_nhsn_data(socrata_token: str, backup_dir: str, custom_run: bool, logger Dataframe as described above. """ # Pull data from Socrata API - df = pull_data(socrata_token, dataset_id="ua7e-t2fy") + df = ( + pull_data(socrata_token, dataset_id=MAIN_DATASET_ID) + if not custom_run + else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=False) + ) keep_columns = list(TYPE_DICT.keys()) @@ -75,7 +115,11 @@ def pull_nhsn_data(socrata_token: str, backup_dir: str, custom_run: bool, logger def pull_preliminary_nhsn_data( - socrata_token: str, backup_dir: str, custom_run: bool, logger: Optional[logging.Logger] = None + socrata_token: str, + backup_dir: str, + custom_run: bool, + issue_date: Optional[str], + logger: Optional[logging.Logger] = None, ): """Pull the latest preliminary NHSN hospital admission data, and conforms it into a dataset. @@ -100,8 +144,11 @@ def pull_preliminary_nhsn_data( pd.DataFrame Dataframe as described above. """ - # Pull data from Socrata API - df = pull_data(socrata_token, dataset_id="mpgq-jmmr") + df = ( + pull_data(socrata_token, dataset_id=PRELIM_DATASET_ID) + if not custom_run + else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=True) + ) keep_columns = list(PRELIM_TYPE_DICT.keys()) diff --git a/nhsn/delphi_nhsn/run.py b/nhsn/delphi_nhsn/run.py index 80f7cab47..15f5559c5 100644 --- a/nhsn/delphi_nhsn/run.py +++ b/nhsn/delphi_nhsn/run.py @@ -25,7 +25,7 @@ from .pull import pull_nhsn_data, pull_preliminary_nhsn_data -def run_module(params): +def run_module(params, logger=None): """ Run the indicator. @@ -35,14 +35,16 @@ def run_module(params): Nested dictionary of parameters. """ start_time = time.time() - logger = get_structured_logger( - __name__, - filename=params["common"].get("log_filename"), - log_exceptions=params["common"].get("log_exceptions", True), - ) + if not logger: + logger = get_structured_logger( + __name__, + filename=params["common"].get("log_filename"), + log_exceptions=params["common"].get("log_exceptions", True), + ) export_dir = params["common"]["export_dir"] backup_dir = params["common"]["backup_dir"] custom_run = params["common"].get("custom_run", False) + issue_date = params.get("patch", dict()).get("issue_date", None) socrata_token = params["indicator"]["socrata_token"] export_start_date = params["indicator"]["export_start_date"] run_stats = [] @@ -51,12 +53,16 @@ def run_module(params): export_start_date = date.today() - timedelta(days=date.today().weekday() + 2) export_start_date = export_start_date.strftime("%Y-%m-%d") - nhsn_df = pull_nhsn_data(socrata_token, backup_dir, custom_run=custom_run, logger=logger) - preliminary_nhsn_df = pull_preliminary_nhsn_data(socrata_token, backup_dir, custom_run=custom_run, logger=logger) + nhsn_df = pull_nhsn_data(socrata_token, backup_dir, custom_run=custom_run, issue_date=issue_date, logger=logger) + preliminary_nhsn_df = pull_preliminary_nhsn_data( + socrata_token, backup_dir, custom_run=custom_run, issue_date=issue_date, logger=logger + ) geo_mapper = GeoMapper() signal_df_dict = {signal: nhsn_df for signal in SIGNALS_MAP} - signal_df_dict.update({signal: preliminary_nhsn_df for signal in PRELIM_SIGNALS_MAP}) + # some of the source backups do not include for preliminary data TODO remove after first patch + if not preliminary_nhsn_df.empty: + signal_df_dict.update({signal: preliminary_nhsn_df for signal in PRELIM_SIGNALS_MAP}) for signal, df_pull in signal_df_dict.items(): for geo in GEOS: diff --git a/nhsn/tests/conftest.py b/nhsn/tests/conftest.py index 525d8ae7e..b89946a02 100644 --- a/nhsn/tests/conftest.py +++ b/nhsn/tests/conftest.py @@ -16,10 +16,10 @@ # queries the nhsn data with timestamp (2021-08-19, 2021-10-19) with CO and USA data -with open("test_data/page.json", "r") as f: +with open(f"{TEST_DIR}/test_data/page.json", "r") as f: TEST_DATA = json.load(f) -with open("test_data/prelim_page.json", "r") as f: +with open(f"{TEST_DIR}/test_data/prelim_page.json", "r") as f: PRELIM_TEST_DATA = json.load(f) @pytest.fixture(scope="session") @@ -50,11 +50,12 @@ def params(): @pytest.fixture def params_w_patch(params): params_copy = copy.deepcopy(params) + params_copy["common"]["custom_run"] = True params_copy["patch"] = { - "start_issue": "2024-06-27", - "end_issue": "2024-06-29", - "patch_dir": "./patch_dir" + "patch_dir": f"{TEST_DIR}/patch_dir", + "issue_date": "2024-12-12", } + return params_copy @pytest.fixture(scope="function") diff --git a/nhsn/tests/test_data/20241212.csv.gz b/nhsn/tests/test_data/20241212.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..61c85a5ba35e757dd53d819a9854aec78d534a17 GIT binary patch literal 3446 zcmV-+4T}$#Uh!5k>dZ*hsUR%@9(~E_dnhJ`0})^c)Y*c9)7*OJse(*K;1m(_Qx`R z+g&@i<9)|@^5y8*eVzLD>%%X%Qh)pU{qgbB(>m{=p1q&GK7IOfs9!68`SN3_T*u@0 z)@ipB^oHH|t2*?1xc~Fh-5>jtZFM%Ml&7zu4n`a=q}Bd-_t^IQ~y4rfXpj~JiLc)q=c$0Nq4Kb~)|-tmZUw`1)dTZWJFTEm0ZjgAMc z8!eyAd@ZdjO`ps=7UlJ?efs(P?Qg%HDts*P&7je}>CI~C_;UHi#RPhWw}&W-K6!BgIeKSn1a6~AN=Gl8ZtXcNd-sZ zed3`{R)|JFNx0M`))+ZK#B`k2Cc*Ea-EktXtmy@7d|-9u&?1~&xfi*0_MeI89->!s@MFQe;Px?YPxt;Sv4NngyW)xS{WlIwg9*EO!n+)#C@zSr}o zr=^$MkAK7b|F{2B1^w$Ef4$`BB;xeyCFL4}o1$DIBoZg5mzdQD>_HHk@N_207q3O! z`FJU{KvY^F#8~6xp|T*NoU&?$WHn$o1}Eq)IQ)JoP?16# z5NqPymURbb5K%ALB$$1|89@so3 zQtvO(*EHDpO9;4bx9msmYqmyh8|tvPxYqN9@%ZJyac4zIJmH*Y^~@D1EqW@_8X|7t zRN|`6JT1Kzb-s0amrphlwmqbt^zZ+2JQ3XXJSf&#mzuhlZW+MOYn*W6)fF-7K`xjm zV>ar^;747h%xWoFNG)hvsVASc;SbgL&6>lcV*EqrgfUTZ#GXLdB<<1_8VqGFOplcm zbi{0NRG{tg^|?sLUA2o_ZY?U*W(#rQUo^P*kU#kMp8P8V{)JSWJ2*l3~#x zotS;#r7PceA0zj?cesAMZz45nnM90#o>dCqz|;b2pc`ytykwOGQ%ot<>L*_yTX7{% znpu@hb529?rmIpWnAfa8Z4B!UCWX#580sKClUv2q6knvV$6<)o2$-}J(C)NA=Nkkj zPtmEC&c$d9Y{netPDaJSM!_xG=)Ui3X$-VRX&YmpuX$zQ3*(JPCtKrL&Ouf}wHm(y zF!!$B0O+>>^x+zSZZ|cDIP>F3G{Vtk_NmY#8)Il-wSkh@rHHV}rX=@3 z#a`G9zL-}?C00|Sd6lmHNaJ4>yYIO@GH1SD${a5-^*S?C$!;0mnGYzw7Zk6R)Sws& zye@-c(31Fc0YV4|=Nq$B*i5`)5upRrKw}CP!FwYU;oj4|BLNe4)Fn)n_z=}YpLi7J zt7QVPz|MLq*WqU3CP-9Mm$;B%#Up7;ClO&zEu`1$i7FzvyoGvmyCGt;I; z?XhutAZ0N`8D~v}=Gi{mV~NFXNRST_z9$J~6dnhqfwb^gJ^U~84gnoF^)Z!#SQKue zYNT_BhYt{S-%#P4DP2PcLeewLkRyyUKojWG;gu=~jX^;WwkgVqfVJQd?5=jtFax>; zfOU0(O7L={FR423;CDS2cv4&EvDmoa%q%tz77dpoW{K6LNDQ{%*q3c^+@oVZTi}kt z+At{3c!r+OY}ro@gW<|E%x9kA_4rqKhRz>+w^Z}$8RiquF!4#g!ZSRb{YB5vA6&uB zaX57i{p(%B&3uQtRmY=PnF`@5pT9{rcJR5ZV^#sw5PG)YGJ zfZ=<>a2P9lh8XY#0K@DvxE4hsLR9SJjD1@w^p5U6En*;YLc<}X#pMl`;lBZG>Z$|T z$iv{-(s&36N$}z(dM}KMNy7j+DsF5nFzH;;L6}~-3L_9Rn7Ztw+?J;^ok2x3A@FY7eBguvt@EpUHT)ZK<&CVCXGO&V7|Cner?eA*jk=69KQ3X1rlv8VId1v`qtH z(iMa+j5jdFX9{Ce*`W;AyKx3XgIF;VmMsbMz!`kYJbV&;pQhR=0 z^n-lwNWPx7p*I4TTw6s~%u!G}Y~RblY4y3pbV z6-PXwMg+lbf)1UMck~0nI|wNNbledx|G~#Au@v(H;SJJEYp4u+!@3ijM)8g^h$$@# zl47>RoKKl1Hi2p_i5EKf?Cm1Jw+n<83`p0Gn>Kib1!TmhMYp!; z@GC3)UKnr8@#lrT4tae)=8Ce?Z;5J2zqF-a8PM+yvp??#@~-Z~-1@=-;WFYSC(v>u zvg=~SCS4-##_0%y@Yle*D%E(G8pARJ6L?uEDpO=&Rc_&}Ye{4JfL!5m(jsEOuG*4K zwFsrTm4!%!wvC)zMW1~@F}nA{tZ-dXsM0UCX176M3rk9vZ++J01ZGKlOXVV8qYo6m z4+;k|YAAFPO0%Oi06feS!!p}ya!ipz@ysEY5Q05?4-bb~VZhKylS^fWjDm61drH65 zOe(gFF^nqX2`26;{6fo0b8V3x8=h9k45@)Zez?HIFy(_6b=irw=A42pKS%JES&|ft zdtMWZ2zgt`9Q~XxwvQ6}i`0y7PA$n!ub4|Y%zh)254i7U$1hVkOy@n3JPCT`sn6{YJ9xpRvWN9%DR~HGI z_F;^WtCb&+57N; z!1qL89U#y%vgIOPN?hJIvxdl)zFCab$70KbnwZ}O{{qY%lN*wNNBcgeB<9J$F2@gL z$iEDyZ{oKGDK^>x5-`{NhKn9#;$k2IW(3M96p2Ko8E(BWwsyAYIPXNW^H8d9OWLOT z=zO*?gv51R#SF&7&^4E|Yi{?&jJ^J{PmNd#8G#pmDBzA9H$)ehl`qRa%{Ta0wi%GV zCRk_wy*S>OQ=ch(<5AxGbMWB8%D*zBivZLu0o}kKeB=D*sSzo=)QG;68c}&=YGj>w z%^k;`ekF}K*pLQK&;kQk#Kytc{KDFjyV^dGDU41ir_j>JoqN7zcOe)S*}`H*$7E!+ z0j?#C+hoBEe=m%KmMHc%iC8FX6RDTYa$pl)&2h~X=U@pgscuDKZJ!UikIx6+6NN`9 z844ASo2?2bI7>iS;F>?)US@n_OhxQ_0ytraj_KCg&-1tHa&Y^8dEhkp{B3*qGWGqJr=RYn{_^G9^Yf>dW!_CYkAC^`^69rt z`_lN+Z{O#}WjxNeOuL?-cU+9$wc)s@$3H$j{C;_{t`I z;v#y}GoJ41QQ_-!;nTzJO}y?^)9MMV#`Nd68gkjPx$JH0MmxOEUmu@0T3NTY+4n80 z!$wbY*SC78yS}gA{(IQBJmArkM_rilsB3*Ijyvt{9e0ne<#^V)b3E(ZJ^C|_XPrC8 zv(DX9&!rC6t-B|>*XX8aCDlz2@BFxDQSV@~VMlw42ReOz{CYo@@3v>++NgDN_(;m^ zAL{tBV`s0x!3TZcctS&?@G%DGw-Ff)AYf-V^QwU_UXrO_rLsnsPM7CSA$0PrgtBS{oisZ z_jT7s@awLPKGgG}mVZ#Ea_7pO%0Hd^i~T!0=T3k7`$?P@J!d&pSKV3bDF?N}xo{5t zlzi}KchZn~kd##LO}tP1>XQ|s(Vr!pY7*;=oFHO4PHU6keCT(a$SYfV!4@CbTsgD| zXIIuDKb`$gMU>T8!qk#oK_MEZmYSTf>T5k!z4(`L=u*1gib1W#o!r5(m{qHPrN|}M z=^FMeuF7u2^k=P(bC)x`;SB$`|A!j-*FXMx%F#*0<<(QlH3oN%a*7a1JUczbtUjOy zNod0JIZHlyE#l5TK{_r&QuuqSIPcEkq!Of_eLR&~AS*49VyyA(p|c>OoU&?$XfR#@>zF@hc`iq}W&9iG#|_G_7P329>ecacr=+b+-*$#Y)MWrhukl|ErP6%XhDJ2woHd8Dim?#~Cya@TZ#)usJM*@oU7^lU zroxn3Nk~To7e{ScjX$iDbQD(m<5HU0NGV~qrch&JH12pj-w!svD;vvzjUg3h&okv% zN+9*3WUO}(Ps~1$Pf8Z}7+Le)<#I0ZBvPZ6vxu?Lvq=FXEW`jF$cGg)p0Y~9LQOH% z>d(GFxZ+AaYi3h2)j18to3ctdWAV)j49BokFe#L-!7vEru-tU0KSG;jgCn> z9qmpFl)l1a@|2xw>CBAA$F|ht+R3jt_)sv7D|oN_S{fg%Q`(2|(bv52@s;sLqjM6vJ_UDa2kV{qmG$vdLcWgb(ZRd&YEuxkS)@sJ{NwlxM* zl8RtvFKh;1%uiCK)l_PJ)U}f|I#w~Cn2T@v6hkJ*OH92@4Vre#@O%0|=KCRYucd~} zP~g`LnL$h9^93d$9b9kBQn99bKnr09^uWayq=N4wF%cTj7?O{P6?F+yB|b*=P$(XT zxx7p~7Ocab+I4_1Zh}NLeTfSRt9&GjX)h5X)j~GU6#x_PRW9TtVk*__WXvRGsE*>R zUW1sOyaZ-&EAN=IITD2-T?4Z@7X0m33}uI8uOAso9zQme`|&p#$_0>9i_}{TrQYTz zuS`FY?Xb9;@1eWWv@0%%x3iRQJIO3<8AaQ^;j9W>AKWB`L`*_4#H4RQxJL*?=#|{V zZG$P+KC_x$Oz)dYj3-V*9iPlO(L5t>~=V+HMln9&UT%N(nMO zIb1b`o2NzH02zzjkRTt3d|yP$DB=#vwbLSQ^{{PbDnTPe12L6?@D}b;HPTb!@q-z$4JxH04A9VE_tCvUWq6LE{4Qx_X9A5YFRR z;&!Uwyq>#8scjxvxLt5%7H$U%uuBnJq}9YoKDKz(lzzr#jLT!|TM8x34b4v0I71|dCf6t*sC-{k4g+w{*u)D_ znSBQJqDVyCi%QO@+uo#ij1#zG1_CKGSVCOfHE|iX58zW*9k@qY2G5>GOCU;u7l)~9 zV5m)21_)B2u2`_-bE62cf92+mK+YiUTuHg@h-Z38MHoVv(Jb<>LsV<3uPqkWB;8CU zTasIFIiu2B?4A4OmpL)3+rDHOd_BZvOfB64 z5i(i<30Y(;_O_oG#kL;3Ya5aU0kVNnEwSTWYG6#Z$5UGdCt7+3E3zGGSTxzTzr_1I zf7*8oYb@4jakfQ2)h{WV>TM0jioame7Wb0o=M0--+7$hu;=59DE22h4@satKl!g8g zQj9KRNp->y8(#990fOMDF|4w%`FP0CjTCM~gPqld7IUaLq6;k|h@TUb=#(s^g9Hm9 zr2q)BBHV9;J1nsj^N;Y6T*Nh0M$%!a#HP|3C@m>1C6i*dJfTmS3vdGQT23)^DBACf za9{68T6`@nKREkGZT`_9mIoh!o=+Rt!VEd0)dFVw=wK`h7+)E0I4V~P-!6CsFTcT= zcI#-7aW57)<~wBg|A@sJ1n-5d z-*rKyO8;YPb}Ll2aHoU`5Lugn%yW7RzXDr0B7bBbsC+L}4usTD=_Hi4($)YlF{zEU z+qSS{ij<0H!n=eJ?D2cJBc`wcOD9b*l}S5F##Qf`(qQ9LqkX+Pe(nr=Bl(sSEV8a6KH8>WRp*+vEi%p?z7P*O zMTE%k_#Odp!(->6TDZ-1oXtKLi@v4BPTbWbhT1zmW+vfMn~h|yre(Z)VZgMUw(VN- zf_i-23`Ngy8AVWe6}!#Ja*hY4Mxk*RhP7@?&9Si z3#a8WS7RA(1a3G{z#Z9s5Nse?KFxJ5@1SE@r)2tGWS!{v>UhIcI4;cMbw|8%f;kmk z=;(J@n8i!hL7@G{Gu)Q0k+RFA=&k7*l^fGFhbNhpEosEahC6sj3plWnHb%zgC~M0^ zYkSwHFzTVuLd$h`Ci<3&hLl)j3%D6~lb_WFxR%{+lLa#jy)Z^vR@>VoVnMb|q+V8= zkWG>`$0eL-jaM@Om4+wm{eFk+l@}JS-@Lh9s<8?rl(^}MeOwl z*f+%8G=x4Ttb_Q;i-f?}%Mf>fY2+iEpR@OWz)Nte5WOX5dO-<{bl_QH46v;@T?ts) zL8qilKEeT3!()NGnKbVu{H(W^Fw9FzG{J>{Alu&9unb{{)iRPyt}ky$x}0SVN!#?1 zo*!(+a^oS#Y3<-;J0RelC`+!;498+K!Cr)t4k+Im2DwPrP0> oWQp!GDI#@{wOPtFE@iVb`6tiy!~XyP0RR6300sRlsxvtN09C@mlK=n! literal 0 HcmV?d00001 diff --git a/nhsn/tests/test_patch.py b/nhsn/tests/test_patch.py new file mode 100644 index 000000000..639d5d262 --- /dev/null +++ b/nhsn/tests/test_patch.py @@ -0,0 +1,130 @@ +import glob +import os +from collections import defaultdict +from pathlib import Path +import shutil +from unittest.mock import patch as mock_patch + +import pandas as pd +from datetime import datetime, timedelta + +from epiweeks import Week + +from delphi_nhsn.patch import filter_source_files, patch +from delphi_nhsn.constants import TOTAL_ADMISSION_COVID_API, TOTAL_ADMISSION_FLU_API +from conftest import TEST_DATA, PRELIM_TEST_DATA, TEST_DIR + +class TestPatch: + + def generate_date_list(self, start_date, end_date): + # Generate a list of dates + date_list = [] + current_date = start_date + + while current_date <= end_date: + date_list.append(current_date.strftime('%Y%m%d')) + current_date += timedelta(days=1) + return date_list + + def generate_dummy_file_names(self): + start_date = datetime(2024, 8, 1) + end_date = datetime(2024, 8, 4) + date_list_part1 = self.generate_date_list(start_date, end_date) + + start_date = datetime(2024, 9, 6) + end_date = datetime(2024, 9, 10) + date_list_part2 = self.generate_date_list(start_date, end_date) + + start_date = datetime(2024, 10, 6) + end_date = datetime(2024, 10, 15) + date_list_part3 = self.generate_date_list(start_date, end_date) + + start_date = datetime(2024, 11, 16) + end_date = datetime(2024, 11, 22) + date_list_part4 = self.generate_date_list(start_date, end_date) + + date_list = date_list_part1 + date_list_part2 + date_list_part3 + date_list_part4 + + file_list = [] + for date in date_list: + custom_filename = Path(f"/tmp/{date}.csv.gz") + file_list.append(custom_filename) + return file_list + + def test_filter_source_files(self): + filelist = self.generate_dummy_file_names() + epiweek_dict = defaultdict(list) + for file in filelist: + issue_dt = datetime.strptime(file.name.split(".")[0], "%Y%m%d") + issue_epiweek = Week.fromdate(issue_dt) + epiweek_dict[issue_epiweek].append(issue_dt) + patch_issue_list = filter_source_files(filelist) + for file in patch_issue_list: + issue_dt = datetime.strptime(file.name.split(".")[0], "%Y%m%d") + issue_epiweek = Week.fromdate(issue_dt) + assert max(epiweek_dict[issue_epiweek]) == issue_dt + + def generate_test_source_files(self): + start_date = datetime(2024, 8, 1) + end_date = datetime(2024, 8, 4) + date_list_part1 = self.generate_date_list(start_date, end_date) + + start_date = datetime(2024, 9, 6) + end_date = datetime(2024, 9, 10) + date_list_part2 = self.generate_date_list(start_date, end_date) + + start_date = datetime(2024, 11, 16) + end_date = datetime(2024, 11, 22) + date_list_part4 = self.generate_date_list(start_date, end_date) + + date_list = date_list_part1 + date_list_part2 + date_list_part4 + + file_list = [] + prelim_file_list = [] + for date in date_list: + custom_filename = f"{TEST_DIR}/backups/{date}.csv.gz" + custom_filename_prelim = f"{TEST_DIR}/backups/{date}_prelim.csv.gz" + test_data = pd.DataFrame(TEST_DATA) + test_data[TOTAL_ADMISSION_COVID_API] = int(date) + test_data[TOTAL_ADMISSION_FLU_API] = int(date) + test_prelim_data = pd.DataFrame(PRELIM_TEST_DATA) + test_prelim_data[TOTAL_ADMISSION_COVID_API] = int(date) + test_prelim_data[TOTAL_ADMISSION_FLU_API] = int(date) + + test_data = test_data.head(2) + test_data.to_csv( + custom_filename, index=False, na_rep="NA", compression="gzip" + ) + test_prelim_data = test_data.head(2) + test_prelim_data.to_csv( + custom_filename_prelim, index=False, na_rep="NA", compression="gzip" + ) + file_list.append(custom_filename) + prelim_file_list.append(custom_filename_prelim) + return file_list, prelim_file_list + + def test_patch(self, params_w_patch): + with mock_patch("delphi_nhsn.patch.read_params", return_value=params_w_patch): + file_list, prelim_file_list = self.generate_test_source_files() + patch(params_w_patch) + + for issue_path in Path(f"{TEST_DIR}/patch_dir").glob("*"): + issue_dt_str = issue_path.name.replace("issue_", "") + for file in Path(issue_path / "nhsn").iterdir(): + df = pd.read_csv(file) + val = Week.fromdate(datetime.strptime(str(int(df["val"][0])), "%Y%m%d")) + assert issue_dt_str == str(val) + + # clean up + shutil.rmtree(f"{TEST_DIR}/patch_dir") + + for file in file_list: + os.remove(file) + + for file in prelim_file_list: + os.remove(file) + + + + + diff --git a/nhsn/tests/test_pull.py b/nhsn/tests/test_pull.py index c09c838d7..daa3acd92 100644 --- a/nhsn/tests/test_pull.py +++ b/nhsn/tests/test_pull.py @@ -8,13 +8,12 @@ from delphi_nhsn.pull import ( pull_nhsn_data, pull_data, - pull_preliminary_nhsn_data + pull_preliminary_nhsn_data, pull_data_from_file ) from delphi_nhsn.constants import SIGNALS_MAP, PRELIM_SIGNALS_MAP from delphi_utils import get_structured_logger -from conftest import TEST_DATA, PRELIM_TEST_DATA - +from conftest import TEST_DATA, PRELIM_TEST_DATA, TEST_DIR DATASETS = [{"id":"ua7e-t2fy", "test_data": TEST_DATA}, @@ -42,16 +41,46 @@ def test_socrata_call(self, mock_socrata, dataset, params): # Check that get method was called with correct arguments mock_client.get.assert_any_call(dataset["id"], limit=50000, offset=0) + def test_pull_from_file(self, caplog, params_w_patch): + backup_dir = f"{TEST_DIR}/test_data" + issue_date = params_w_patch["patch"]["issue_date"] + logger = get_structured_logger() + + # Load test data + expected_data = pd.DataFrame(TEST_DATA) + + df = pull_data_from_file(backup_dir, issue_date, logger=logger) + df = df.astype('str') + expected_data = expected_data.astype('str') + assert "Pulling data from file" in caplog.text + + pd.testing.assert_frame_equal(expected_data, df) + + def test_pull_from_file_prelim(self, caplog, params_w_patch): + backup_dir = f"{TEST_DIR}/test_data" + issue_date = params_w_patch["patch"]["issue_date"] + logger = get_structured_logger() + + # Load test data + expected_data = pd.DataFrame(PRELIM_TEST_DATA) + + df = pull_data_from_file(backup_dir, issue_date, logger=logger, prelim_flag=True) + df = df.astype('str') + expected_data = expected_data.astype('str') + + assert "Pulling data from file" in caplog.text + pd.testing.assert_frame_equal(expected_data, df) + def test_pull_nhsn_data_output(self, caplog, params): with patch('sodapy.Socrata.get') as mock_get: mock_get.side_effect = [TEST_DATA, []] backup_dir = params["common"]["backup_dir"] test_token = params["indicator"]["socrata_token"] - custom_run = True + custom_run = params["common"]["custom_run"] logger = get_structured_logger() - result = pull_nhsn_data(test_token, backup_dir, custom_run, logger) + result = pull_nhsn_data(test_token, backup_dir, custom_run, issue_date=None, logger=logger) # Check result assert result["timestamp"].notnull().all(), "timestamp has rogue NaN" @@ -74,7 +103,7 @@ def test_pull_nhsn_data_backup(self, caplog, params): logger = get_structured_logger() # Call function with test token - pull_nhsn_data(test_token, backup_dir, custom_run, logger) + pull_nhsn_data(test_token, backup_dir, custom_run, issue_date=None, logger=logger) # Check logger used: assert "Backup file created" in caplog.text @@ -99,11 +128,11 @@ def test_pull_prelim_nhsn_data_output(self, caplog, params): mock_get.side_effect = [PRELIM_TEST_DATA, []] backup_dir = params["common"]["backup_dir"] test_token = params["indicator"]["socrata_token"] - custom_run = True + custom_run = params["common"]["custom_run"] logger = get_structured_logger() - result = pull_preliminary_nhsn_data(test_token, backup_dir, custom_run, logger) + result = pull_preliminary_nhsn_data(test_token, backup_dir, custom_run, issue_date=None, logger=logger) # Check result assert result["timestamp"].notnull().all(), "timestamp has rogue NaN" @@ -126,7 +155,7 @@ def test_pull_prelim_nhsn_data_backup(self, caplog, params): logger = get_structured_logger() # Call function with test token - pull_preliminary_nhsn_data(test_token, backup_dir, custom_run, logger) + pull_preliminary_nhsn_data(test_token, backup_dir, custom_run, issue_date=None, logger=logger) # Check logger used: assert "Backup file created" in caplog.text diff --git a/nhsn/tests/test_run.py b/nhsn/tests/test_run.py index cfc47e50f..c96ec7953 100644 --- a/nhsn/tests/test_run.py +++ b/nhsn/tests/test_run.py @@ -1,3 +1,4 @@ +import glob import os from pathlib import Path @@ -47,3 +48,8 @@ def test_output_files_exist(self, params, run_as_module): for file in Path(export_dir).glob("*.csv"): os.remove(file) + + today = pd.Timestamp.today().strftime("%Y%m%d") + backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*") + for file in backup_dir: + os.remove(file) From 049de70e961c39409d48cd6a75775c76aefa46da Mon Sep 17 00:00:00 2001 From: aysim319 Date: Wed, 15 Jan 2025 16:56:00 -0500 Subject: [PATCH 2/7] issue date must be 8 digits for patching (#2100) * issue date must be 8 digits * fixed test * suggested changes --- nhsn/delphi_nhsn/patch.py | 11 ++++++----- nhsn/tests/test_patch.py | 3 +-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/nhsn/delphi_nhsn/patch.py b/nhsn/delphi_nhsn/patch.py index 31fae3070..cefb0e564 100644 --- a/nhsn/delphi_nhsn/patch.py +++ b/nhsn/delphi_nhsn/patch.py @@ -78,11 +78,12 @@ def patch(params): patch_list = filter_source_files(source_files) for file in patch_list: - issue_date = datetime.strptime(file.name.split(".")[0], "%Y%m%d") - current_issue_ew = Week.fromdate(issue_date) - logger.info("Running issue", issue_date=issue_date.strftime("%Y-%m-%d")) - params["patch"]["issue_date"] = issue_date.strftime("%Y%m%d") - current_issue_dir = f"{params['patch']['patch_dir']}/issue_{current_issue_ew}/nhsn" + issue_date_str = file.name.split(".")[0] + logger.info("Running issue", issue_date=datetime.strptime(issue_date_str, "%Y%m%d").strftime("%Y-%m-%d")) + params["patch"]["issue_date"] = issue_date_str + # regardless of week date type or not the directory name must be issue_date_YYYYMMDD + # conversion in done in acquisition + current_issue_dir = f"{params['patch']['patch_dir']}/issue_{issue_date_str}/nhsn" makedirs(current_issue_dir, exist_ok=True) params["common"]["export_dir"] = current_issue_dir params["common"]["custom_run"] = True diff --git a/nhsn/tests/test_patch.py b/nhsn/tests/test_patch.py index 639d5d262..066ef4736 100644 --- a/nhsn/tests/test_patch.py +++ b/nhsn/tests/test_patch.py @@ -112,8 +112,7 @@ def test_patch(self, params_w_patch): issue_dt_str = issue_path.name.replace("issue_", "") for file in Path(issue_path / "nhsn").iterdir(): df = pd.read_csv(file) - val = Week.fromdate(datetime.strptime(str(int(df["val"][0])), "%Y%m%d")) - assert issue_dt_str == str(val) + assert issue_dt_str == str(int(df["val"][0])) # clean up shutil.rmtree(f"{TEST_DIR}/patch_dir") From 5b413112f9eb37904c68bde4f8f91713dccf9ac6 Mon Sep 17 00:00:00 2001 From: aysim319 Date: Wed, 22 Jan 2025 09:42:33 -0500 Subject: [PATCH 3/7] clean state column (#2105) * in progress * filtering also on hhs and check in test * regenerated test data to update current behavior * lint --- nhsn/delphi_nhsn/run.py | 5 +- nhsn/tests/test_data/20241212.csv.gz | Bin 3446 -> 3467 bytes nhsn/tests/test_data/page.json | 118 +++++++++++++++++++++++++++ nhsn/tests/test_run.py | 3 + 4 files changed, 125 insertions(+), 1 deletion(-) diff --git a/nhsn/delphi_nhsn/run.py b/nhsn/delphi_nhsn/run.py index 15f5559c5..92e24bbda 100644 --- a/nhsn/delphi_nhsn/run.py +++ b/nhsn/delphi_nhsn/run.py @@ -73,14 +73,17 @@ def run_module(params, logger=None): df = df[df["geo_id"] == "us"] elif geo == "hhs": df = df[df["geo_id"] != "us"] + df = df[df["geo_id"].str.len() == 2] df.rename(columns={"geo_id": "state_id"}, inplace=True) df = geo_mapper.add_geocode(df, "state_id", "state_code", from_col="state_id") df = geo_mapper.add_geocode(df, "state_code", "hhs", from_col="state_code", new_col="hhs") df = geo_mapper.replace_geocode( df, from_col="state_code", from_code="state_code", new_col="geo_id", new_code="hhs" ) - else: + elif geo == "state": df = df[df_pull["geo_id"] != "us"] + df = df[df["geo_id"].str.len() == 2] # hhs region is a value in geo_id column + df["se"] = np.nan df["sample_size"] = np.nan dates = create_export_csv( diff --git a/nhsn/tests/test_data/20241212.csv.gz b/nhsn/tests/test_data/20241212.csv.gz index 61c85a5ba35e757dd53d819a9854aec78d534a17..26f91c200dfd1cae144458dfe91b0a6cf767d596 100644 GIT binary patch literal 3467 zcmV;64RrD!iwFop3y^05|1vN#H83$UF)m|sb^z>}$!_Jy5k>d@3V*;8kr6q2XZQoW zu)Q{_C3Sc8APH3}_wW1O$mIBvQfVOr1_Ve1+vm@Z50{5;zkhysdj8@5?(25{)7`h2FvyPcpn?8aZ!q2I&(AD`}i-=A!&vpJyl>cuc;B!p zc=$`JiigdslF!a_Q3yJmDS13%d^Y3x_8K0K7@z)lzP)zCKFVti4_Y@m z9<*+>d@}R3w5~LLGVfTF*T44Zhp)Fk|8%PGvA{QjM)#&SpNak7aw_+2(?Rgtri0{9 z#eORDQ-L3TWh=VuiTF5z}3`H{!wFVw)X&0T7H;qNp7+wd~nc&ogxYpH- zpTv=Re~G@P!Ny-gz;(N2KXPBQHEP>XhrPwMp0A9@F9(i0D@x)C=RB)tu1IOoQ<2sX zaSNvsS9RuT>9wfyt<$@FvXQXuA@!tx|Ci&5;I`*MvCg{G)V*}e0DfNMgcGl>h*1x6 z!9*FeQBMXx>MCVcOUXiNLEB0_`K%3psK#&B93~axA37(DiHalk1i~h1m#)xYD05+Y ztfZhLW{aZ&ZI7?dMLO=PUEFeOQK2?lhztLs!NrIC!N2$9Um5T(q~h#prW{KNBwLgW ziw5b$>;o@d`M&!Yx#zvZ_2YdLsZq-$V*K;0QUC|07ElA-U?bxtt0b6WN~u;q`2yLB zD|yn)s$`mT8j3evl`_G+W(8_vSa&cfbgscr2l1KQDyF9RB8@!`L##%?q@93vrv*CS zATW7~PPKF{Mq^+z<~VmUDh@UZZqY{feP2sspfyU{7z2IHD+6B{Z#+8L8qabLvJ$G* z_!WS;cl8E9zXhNV*8qIO>^Hm{FKxB#`}vb$^9itv?oR2(ZoiJp&Gbv1-$=RMxG>@& zTsZPjl5q?6j=;;39IF#74>1V?PB>Lxjeo(JA4j4QjxMuLg&x@$Lj$V~l*BGYgiSUj zxd$rt!e;Qryh19mni9>cbnQnP|Ek!1&+U;p^Ziohc!{aknVCv<%ka*8K=Hkxc&(%c z#ZcgN85Dz-#HR}oLO3|zn5DvI;uVVs9iRppQ?Ll$8<`0Ap6(q9n7E@ZVXDN3s2=*n zqcC4B6MzMF)>F9-HxoBOqMEwIg#;@eNn1LJ2y<#7eU}{|5qMP^YBDdCYEEIyWMQa| z(yLyBm{Yh%VsIM|FlX~P6mI1fiOnOywZGsNb`19YGq;fa$8O>E`0sQJR|qaO@;!c` z_uuCizA%4-+SP`Liffm{;cl}*<&|%!hvH^=B5qK%Xjn#}v2WC>LiY`BAr(U!LK>u@ zZxORc#6q@}++$;7Cd`=mj9yJgjC^pEt!s(qby~DrvY+e@aVB&6+SV#ed+_PZICJ03 zv}sX$Y}_75SqxFeSyQ2Tw$JuhVzC<%Vl^oegDp7rWg8s#=-AH| zxMQ$349W|hq31JO_EW=Pxbh71nP+%C{yRKF=MTPHs(JMc^NDAe_$0r>Gd!LBRnO2L zT*1w8ICTyE_q&Fh`3`lfjz_UF6~a|Mjh%*eKYVbENm6o=rH`q>P_&22(m>W|Xm&JG zr!eTs6&Y6JCBXfGj5(x2?R`;z@e)jHFDzCL21xS_{Q|@omut^G`Y&mzXntXh3n+4E zl8o{J!}o;YFjn>qG2jaThS_IuEs8{hsMyIF`?gf*9o>Ce#6aYPhC@h;%Ns7ke*@ao zRR^+>hrzR@@emM_;Kfb!UKkaVh5>R^+}K!P(z&98FuigWMj&P|b=gU|El+1UgNkTE z?4{3@SBC)99!PCrv!=E_lkslbQeoY}&|8?C`zlp32wb2`_taR2b|iT=hzW4~LGc!r)_d zp~VdxFcNtgO68YDdq*j8>E@mP#N}ybtg8B;vHoW zQ(6`z#cYW=pE6Bs0@Yd)FLdzP+eLtH7YHpFmX?rPyucQ{6$4csYyv$uZSV>U$cRsi zZf(=yS62AFGTxZu&kK7U^7?+v6=kL064jD^X-mH{px+y2f7uV@UEPPd^_2y}WyEVv zpyftn*TsrWx>ss zlzyq1RBRbz7*)m-Ox#uYg_f1(+9EwRJgtx!QUinhaDj&`XD7ZNVJ!On{poPK9>TL#{4S{RQDz${N z_u&J9?}@-VK%i%2%SF7DxV&#>4UsKY zlabX1xRx+(lLa&Uy)X`1qS)IcVxh22q+T}5flYWd$2C)&gC)46x)p`BeLmc!NMmndXBZR}Fy$;$|MYm+(rxebrzd zp+u8g2#GH|WNNv%5Ub_>n6F+x9dtR%7KFAiBRvz>+{1$>IL;IYFB?zJrL#prmq$tB z;rQ2*#LY8{_2{d`99~}+G`IZaB=KwW-`Gmucne6$ygs-dA5tXqb|Ionng`KwK9r4s tqSq;swt#V}eSUoSvwX<;L(cz4a{ey>009600|55l2_^kH004`mwp#!I literal 3446 zcmV-+4T}$#Uh!5k>dZ*hsUR%@9(~E_dnhJ`0})^c)Y*c9)7*OJse(*K;1m(_Qx`R z+g&@i<9)|@^5y8*eVzLD>%%X%Qh)pU{qgbB(>m{=p1q&GK7IOfs9!68`SN3_T*u@0 z)@ipB^oHH|t2*?1xc~Fh-5>jtZFM%Ml&7zu4n`a=q}Bd-_t^IQ~y4rfXpj~JiLc)q=c$0Nq4Kb~)|-tmZUw`1)dTZWJFTEm0ZjgAMc z8!eyAd@ZdjO`ps=7UlJ?efs(P?Qg%HDts*P&7je}>CI~C_;UHi#RPhWw}&W-K6!BgIeKSn1a6~AN=Gl8ZtXcNd-sZ zed3`{R)|JFNx0M`))+ZK#B`k2Cc*Ea-EktXtmy@7d|-9u&?1~&xfi*0_MeI89->!s@MFQe;Px?YPxt;Sv4NngyW)xS{WlIwg9*EO!n+)#C@zSr}o zr=^$MkAK7b|F{2B1^w$Ef4$`BB;xeyCFL4}o1$DIBoZg5mzdQD>_HHk@N_207q3O! z`FJU{KvY^F#8~6xp|T*NoU&?$WHn$o1}Eq)IQ)JoP?16# z5NqPymURbb5K%ALB$$1|89@so3 zQtvO(*EHDpO9;4bx9msmYqmyh8|tvPxYqN9@%ZJyac4zIJmH*Y^~@D1EqW@_8X|7t zRN|`6JT1Kzb-s0amrphlwmqbt^zZ+2JQ3XXJSf&#mzuhlZW+MOYn*W6)fF-7K`xjm zV>ar^;747h%xWoFNG)hvsVASc;SbgL&6>lcV*EqrgfUTZ#GXLdB<<1_8VqGFOplcm zbi{0NRG{tg^|?sLUA2o_ZY?U*W(#rQUo^P*kU#kMp8P8V{)JSWJ2*l3~#x zotS;#r7PceA0zj?cesAMZz45nnM90#o>dCqz|;b2pc`ytykwOGQ%ot<>L*_yTX7{% znpu@hb529?rmIpWnAfa8Z4B!UCWX#580sKClUv2q6knvV$6<)o2$-}J(C)NA=Nkkj zPtmEC&c$d9Y{netPDaJSM!_xG=)Ui3X$-VRX&YmpuX$zQ3*(JPCtKrL&Ouf}wHm(y zF!!$B0O+>>^x+zSZZ|cDIP>F3G{Vtk_NmY#8)Il-wSkh@rHHV}rX=@3 z#a`G9zL-}?C00|Sd6lmHNaJ4>yYIO@GH1SD${a5-^*S?C$!;0mnGYzw7Zk6R)Sws& zye@-c(31Fc0YV4|=Nq$B*i5`)5upRrKw}CP!FwYU;oj4|BLNe4)Fn)n_z=}YpLi7J zt7QVPz|MLq*WqU3CP-9Mm$;B%#Up7;ClO&zEu`1$i7FzvyoGvmyCGt;I; z?XhutAZ0N`8D~v}=Gi{mV~NFXNRST_z9$J~6dnhqfwb^gJ^U~84gnoF^)Z!#SQKue zYNT_BhYt{S-%#P4DP2PcLeewLkRyyUKojWG;gu=~jX^;WwkgVqfVJQd?5=jtFax>; zfOU0(O7L={FR423;CDS2cv4&EvDmoa%q%tz77dpoW{K6LNDQ{%*q3c^+@oVZTi}kt z+At{3c!r+OY}ro@gW<|E%x9kA_4rqKhRz>+w^Z}$8RiquF!4#g!ZSRb{YB5vA6&uB zaX57i{p(%B&3uQtRmY=PnF`@5pT9{rcJR5ZV^#sw5PG)YGJ zfZ=<>a2P9lh8XY#0K@DvxE4hsLR9SJjD1@w^p5U6En*;YLc<}X#pMl`;lBZG>Z$|T z$iv{-(s&36N$}z(dM}KMNy7j+DsF5nFzH;;L6}~-3L_9Rn7Ztw+?J;^ok2x3A@FY7eBguvt@EpUHT)ZK<&CVCXGO&V7|Cner?eA*jk=69KQ3X1rlv8VId1v`qtH z(iMa+j5jdFX9{Ce*`W;AyKx3XgIF;VmMsbMz!`kYJbV&;pQhR=0 z^n-lwNWPx7p*I4TTw6s~%u!G}Y~RblY4y3pbV z6-PXwMg+lbf)1UMck~0nI|wNNbledx|G~#Au@v(H;SJJEYp4u+!@3ijM)8g^h$$@# zl47>RoKKl1Hi2p_i5EKf?Cm1Jw+n<83`p0Gn>Kib1!TmhMYp!; z@GC3)UKnr8@#lrT4tae)=8Ce?Z;5J2zqF-a8PM+yvp??#@~-Z~-1@=-;WFYSC(v>u zvg=~SCS4-##_0%y@Yle*D%E(G8pARJ6L?uEDpO=&Rc_&}Ye{4JfL!5m(jsEOuG*4K zwFsrTm4!%!wvC)zMW1~@F}nA{tZ-dXsM0UCX176M3rk9vZ++J01ZGKlOXVV8qYo6m z4+;k|YAAFPO0%Oi06feS!!p}ya!ipz@ysEY5Q05?4-bb~VZhKylS^fWjDm61drH65 zOe(gFF^nqX2`26;{6fo0b8V3x8=h9k45@)Zez?HIFy(_6b=irw=A42pKS%JES&|ft zdtMWZ2zgt`9Q~XxwvQ6}i`0y7PA$n!ub4|Y%zh)254i7U$1hVkOy@n3JPCT`sn6{YJ9xpRvWN9%DR~HGI z_F;^WtCb&+57N; z!1qL89U#y%vgIOPN?hJIvxdl)zFCab$70KbnwZ}O{{qY%lN*wNNBcgeB<9J$F2@gL z$iEDyZ{oKGDK^>x5-`{NhKn9#;$k2IW(3M96p2Ko8E(BWwsyAYIPXNW^H8d9OWLOT z=zO*?gv51R#SF&7&^4E|Yi{?&jJ^J{PmNd#8G#pmDBzA9H$)ehl`qRa%{Ta0wi%GV zCRk_wy*S>OQ=ch(<5AxGbMWB8%D*zBivZLu0o}kKeB=D*sSzo=)QG;68c}&=YGj>w z%^k;`ekF}K*pLQK&;kQk#Kytc{KDFjyV^dGDU41ir_j>JoqN7zcOe)S*}`H*$7E!+ z0j?#C+hoBEe=m%KmMHc%iC8FX6RDTYa$pl)&2h~X=U@pgscuDKZJ!UikIx6+6NN`9 z844ASo2?2bI7>iS;F>?)US@n_OhxQ_0ytraj_KC Date: Wed, 22 Jan 2025 10:37:50 -0500 Subject: [PATCH 4/7] prevent sircal from alerting on NSSP secondary signals (#2098) * prevent sircal from alerting on NSSP secondary signals * JSON doesnt like trailing commas --- ansible/templates/sir_complainsalot-params-prod.json.j2 | 8 +++++++- sir_complainsalot/params.json.template | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/ansible/templates/sir_complainsalot-params-prod.json.j2 b/ansible/templates/sir_complainsalot-params-prod.json.j2 index 7bb2d179a..0dc66608d 100644 --- a/ansible/templates/sir_complainsalot-params-prod.json.j2 +++ b/ansible/templates/sir_complainsalot-params-prod.json.j2 @@ -44,7 +44,13 @@ }, "nssp": { "max_age":19, - "maintainers": [] + "maintainers": [], + "retired-signals": [ + "pct_ed_visits_combined_2023rvr", + "pct_ed_visits_covid_2023rvr", + "pct_ed_visits_influenza_2023rvr", + "pct_ed_visits_rsv_2023rvr" + ] }, "nhsn": { "max_age":19, diff --git a/sir_complainsalot/params.json.template b/sir_complainsalot/params.json.template index 64c4bee17..cf784774f 100644 --- a/sir_complainsalot/params.json.template +++ b/sir_complainsalot/params.json.template @@ -44,7 +44,13 @@ }, "nssp": { "max_age":19, - "maintainers": [] + "maintainers": [], + "retired-signals": [ + "pct_ed_visits_combined_2023rvr", + "pct_ed_visits_covid_2023rvr", + "pct_ed_visits_influenza_2023rvr", + "pct_ed_visits_rsv_2023rvr" + ] } } } From eba4ea2b4f68f99b889da28bbce6c521bd896dd0 Mon Sep 17 00:00:00 2001 From: minhkhul <118945681+minhkhul@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:44:02 -0500 Subject: [PATCH 5/7] Delete secondary nssp signals (#2101) * delete secondary * remove docs * Update nssp/DETAILS.md Co-authored-by: george * Update nssp/DETAILS.md Co-authored-by: george --------- Co-authored-by: george --- nssp/DETAILS.md | 17 ++--------- nssp/README.md | 4 +-- nssp/delphi_nssp/constants.py | 26 ----------------- nssp/delphi_nssp/pull.py | 54 ----------------------------------- nssp/delphi_nssp/run.py | 51 ++------------------------------- nssp/tests/test_pull.py | 44 ---------------------------- 6 files changed, 5 insertions(+), 191 deletions(-) diff --git a/nssp/DETAILS.md b/nssp/DETAILS.md index 692d85559..9bac16879 100644 --- a/nssp/DETAILS.md +++ b/nssp/DETAILS.md @@ -2,29 +2,16 @@ We import the NSSP Emergency Department Visit data, including percentage and smoothed percentage of ER visits attributable to a given pathogen, from the CDC website. The data is provided at the county level, state level and national level; we do a population-weighted mean to aggregate from county data up to the HRR and MSA levels. -There are 2 sources we grab data from for nssp: -- Primary source: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview -- Secondary (2023RVR) source: https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview -There are 8 signals output from the primary source and 4 output from secondary. There are no smoothed signals from secondary source. - -Note that the data produced from secondary source are mostly the same as their primary source equivalent, with past analysis shows around 95% of datapoints having less than 0.1 value difference and the other 5% having a 0.1 to 1.2 value difference. +NSSP source data: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview ## Geographical Levels -Primary source: * `state`: reported from source using two-letter postal code * `county`: reported from source using fips code * `national`: just `us` for now, reported from source * `hhs`, `hrr`, `msa`: not reported from source, so we computed them from county-level data using a weighted mean. Each county is assigned a weight equal to its population in the last census (2020). -Secondary (2023RVR) source: -* `state`: reported from source -* `hhs`: reported from source -* `national`: reported from source - ## Metrics * `percent_visits_covid`, `percent_visits_rsv`, `percent_visits_influenza`: percentage of emergency department patient visits for specified pathogen. * `percent_visits_combined`: sum of the three percentages of visits for flu, rsv and covid. * `smoothed_percent_visits_covid`, `smoothed_percent_visits_rsv`, `smoothed_percent_visits_influenza`: 3 week moving average of the percentage of emergency department patient visits for specified pathogen. -* `smoothed_percent_visits_combined`: 3 week moving average of the sum of the three percentages of visits for flu, rsv and covid. -* `percent_visits_covid_2023RVR`, `percent_visits_rsv_2023RVR`, `percent_visits_influenza_2023RVR`: Taken from secondary source, percentage of emergency department patient visits for specified pathogen. -* `percent_visits_combined_2023RVR`: Taken from secondary source, sum of the three percentages of visits for flu, rsv and covid. +* `smoothed_percent_visits_combined`: 3 week moving average of the sum of the three percentages of visits for flu, rsv and covid. \ No newline at end of file diff --git a/nssp/README.md b/nssp/README.md index c3f57b94b..d062771c0 100644 --- a/nssp/README.md +++ b/nssp/README.md @@ -2,9 +2,7 @@ We import the NSSP Emergency Department Visit data, currently only the smoothed concentration, from the CDC website, aggregate to the state and national level from the wastewater sample site level, and export the aggregated data. -There are 2 sources we grab data from for nssp: -- Primary source: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview -- Secondary source: https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview +NSSP source data: https://data.cdc.gov/Public-Health-Surveillance/NSSP-Emergency-Department-Visit-Trajectories-by-St/rdmq-nq56/data_preview For details see the `DETAILS.md` file in this directory. diff --git a/nssp/delphi_nssp/constants.py b/nssp/delphi_nssp/constants.py index 76d51b927..9b98d2012 100644 --- a/nssp/delphi_nssp/constants.py +++ b/nssp/delphi_nssp/constants.py @@ -41,29 +41,3 @@ "fips": str, } ) - -SECONDARY_COLS_MAP = { - "week_end": "timestamp", - "geography": "geo_value", - "percent_visits": "val", - "pathogen": "signal", -} - -SECONDARY_SIGNALS_MAP = { - "COVID-19": "pct_ed_visits_covid_2023RVR", - "Influenza": "pct_ed_visits_influenza_2023RVR", - "RSV": "pct_ed_visits_rsv_2023RVR", - "Combined": "pct_ed_visits_combined_2023RVR", -} - -SECONDARY_SIGNALS = [val for (key, val) in SECONDARY_SIGNALS_MAP.items()] -SECONDARY_GEOS = ["state", "nation", "hhs"] - -SECONDARY_TYPE_DICT = { - "timestamp": "datetime64[ns]", - "geo_value": str, - "val": float, - "geo_type": str, - "signal": str, -} -SECONDARY_KEEP_COLS = [key for (key, val) in SECONDARY_TYPE_DICT.items()] diff --git a/nssp/delphi_nssp/pull.py b/nssp/delphi_nssp/pull.py index 94058dea8..8fcc4da09 100644 --- a/nssp/delphi_nssp/pull.py +++ b/nssp/delphi_nssp/pull.py @@ -10,10 +10,6 @@ from .constants import ( NEWLINE, - SECONDARY_COLS_MAP, - SECONDARY_KEEP_COLS, - SECONDARY_SIGNALS_MAP, - SECONDARY_TYPE_DICT, SIGNALS, SIGNALS_MAP, TYPE_DICT, @@ -96,53 +92,3 @@ def pull_nssp_data(socrata_token: str, backup_dir: str, custom_run: bool, logger keep_columns = ["timestamp", "geography", "county", "fips"] return df_ervisits[SIGNALS + keep_columns] - - -def secondary_pull_nssp_data( - socrata_token: str, backup_dir: str, custom_run: bool, logger: Optional[logging.Logger] = None -): - """Pull the latest NSSP ER visits secondary dataset. - - https://data.cdc.gov/Public-Health-Surveillance/2023-Respiratory-Virus-Response-NSSP-Emergency-Dep/7mra-9cq9/data_preview - - The output dataset has: - - - Each row corresponds to a single observation - - Parameters - ---------- - socrata_token: str - My App Token for pulling the NSSP data (could be the same as the nchs data) - - Returns - ------- - pd.DataFrame - Dataframe as described above. - """ - socrata_results = pull_with_socrata_api(socrata_token, "7mra-9cq9") - df_ervisits = pd.DataFrame.from_records(socrata_results) - create_backup_csv(df_ervisits, backup_dir, custom_run, sensor="secondary", logger=logger) - df_ervisits = df_ervisits.rename(columns=SECONDARY_COLS_MAP) - - # geo_type is not provided in the dataset, so we infer it from the geo_value - # which is either state names, "National" or hhs region numbers - df_ervisits["geo_type"] = "state" - - df_ervisits.loc[df_ervisits["geo_value"] == "National", "geo_type"] = "nation" - - hhs_region_mask = df_ervisits["geo_value"].str.lower().str.startswith("region ") - df_ervisits.loc[hhs_region_mask, "geo_value"] = df_ervisits.loc[hhs_region_mask, "geo_value"].str.replace( - "Region ", "" - ) - df_ervisits.loc[hhs_region_mask, "geo_type"] = "hhs" - - df_ervisits["signal"] = df_ervisits["signal"].map(SECONDARY_SIGNALS_MAP) - - df_ervisits = df_ervisits[SECONDARY_KEEP_COLS] - - try: - df_ervisits = df_ervisits.astype(SECONDARY_TYPE_DICT) - except KeyError as exc: - raise ValueError(warn_string(df_ervisits, SECONDARY_TYPE_DICT)) from exc - - return df_ervisits diff --git a/nssp/delphi_nssp/run.py b/nssp/delphi_nssp/run.py index 417c49ab2..7e069e483 100644 --- a/nssp/delphi_nssp/run.py +++ b/nssp/delphi_nssp/run.py @@ -31,8 +31,8 @@ from delphi_utils.geomap import GeoMapper from delphi_utils.nancodes import add_default_nancodes -from .constants import AUXILIARY_COLS, CSV_COLS, GEOS, SECONDARY_GEOS, SECONDARY_SIGNALS, SIGNALS -from .pull import pull_nssp_data, secondary_pull_nssp_data +from .constants import AUXILIARY_COLS, CSV_COLS, GEOS, SIGNALS +from .pull import pull_nssp_data def add_needed_columns(df, col_names=None): @@ -141,52 +141,5 @@ def run_module(params): if len(dates) > 0: run_stats.append((max(dates), len(dates))) - logger.info("Generating secondary signals") - secondary_df_pull = secondary_pull_nssp_data(socrata_token, backup_dir, custom_run, logger) - for signal in SECONDARY_SIGNALS: - secondary_df_pull_signal = secondary_df_pull[secondary_df_pull["signal"] == signal] - if secondary_df_pull_signal.empty: - logger.warning("No data found for signal", signal=signal) - continue - for geo in SECONDARY_GEOS: - df = secondary_df_pull_signal.copy() - logger.info("Generating signal and exporting to CSV", geo_type=geo, signal=signal) - if geo == "state": - df = df[(df["geo_type"] == "state")] - df["geo_id"] = df["geo_value"].apply( - lambda x: ( - us.states.lookup(x).abbr.lower() - if us.states.lookup(x) - else ("dc" if x == "District of Columbia" else x) - ) - ) - unexpected_state_names = df[df["geo_id"] == df["geo_value"]] - if unexpected_state_names.shape[0] > 0: - logger.error( - "Unexpected state names", - unexpected_state_names=unexpected_state_names["geo_value"].unique(), - ) - raise RuntimeError - elif geo == "nation": - df = df[(df["geo_type"] == "nation")] - df["geo_id"] = "us" - elif geo == "hhs": - df = df[(df["geo_type"] == "hhs")] - df["geo_id"] = df["geo_value"] - # add se, sample_size, and na codes - missing_cols = set(CSV_COLS) - set(df.columns) - df = add_needed_columns(df, col_names=list(missing_cols)) - df_csv = df[CSV_COLS + ["timestamp"]] - # actual export - dates = create_export_csv( - df_csv, - geo_res=geo, - export_dir=export_dir, - sensor=signal, - weekly_dates=True, - ) - if len(dates) > 0: - run_stats.append((max(dates), len(dates))) - ## log this indicator run logging(start_time, run_stats, logger) diff --git a/nssp/tests/test_pull.py b/nssp/tests/test_pull.py index 30debd6cd..a03221019 100644 --- a/nssp/tests/test_pull.py +++ b/nssp/tests/test_pull.py @@ -7,16 +7,11 @@ from delphi_nssp.pull import ( pull_nssp_data, - secondary_pull_nssp_data, pull_with_socrata_api, ) from delphi_nssp.constants import ( NEWLINE, - SECONDARY_COLS_MAP, - SECONDARY_KEEP_COLS, - SECONDARY_SIGNALS_MAP, - SECONDARY_TYPE_DICT, SIGNALS, SIGNALS_MAP, TYPE_DICT, @@ -81,44 +76,5 @@ def test_pull_nssp_data(self, mock_socrata, caplog): for file in backup_files: os.remove(file) - @patch("delphi_nssp.pull.Socrata") - def test_secondary_pull_nssp_data(self, mock_socrata): - today = pd.Timestamp.today().strftime("%Y%m%d") - backup_dir = 'test_raw_data_backups' - - # Load test data - with open("test_data/secondary_page.txt", "r") as f: - test_data = json.load(f) - - # Mock Socrata client and its get method - mock_client = MagicMock() - mock_client.get.side_effect = [test_data, []] # Return test data on first call, empty list on second call - mock_socrata.return_value = mock_client - - custom_run = False - logger = get_structured_logger() - # Call function with test token - test_token = "test_token" - result = secondary_pull_nssp_data(test_token, backup_dir, custom_run, logger) - # print(result) - - # Check that Socrata client was initialized with correct arguments - mock_socrata.assert_called_once_with("data.cdc.gov", test_token) - - # Check that get method was called with correct arguments - mock_client.get.assert_any_call("7mra-9cq9", limit=50000, offset=0) - - for col in SECONDARY_KEEP_COLS: - assert result[col].notnull().all(), f"{col} has rogue NaN" - - assert result[result['geo_value'].str.startswith('Region') ].empty, "'Region ' need to be removed from geo_value for geo_type 'hhs'" - assert (result[result['geo_type'] == 'nation']['geo_value'] == 'National').all(), "All rows with geo_type 'nation' must have geo_value 'National'" - - # Check that backup file was created - backup_files = glob.glob(f"{backup_dir}/{today}*") - assert len(backup_files) == 2, "Backup file was not created" - for file in backup_files: - os.remove(file) - if __name__ == "__main__": unittest.main() From 6167352b52c1711842a0310d653632cca5b3af7f Mon Sep 17 00:00:00 2001 From: Delphi Deploy Bot Date: Wed, 22 Jan 2025 17:52:40 +0000 Subject: [PATCH 6/7] chore: bump covidcast-indicators to 0.3.59 --- .bumpversion.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 3016571af..c8fd22c0e 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.58 +current_version = 0.3.59 commit = True message = chore: bump covidcast-indicators to {new_version} tag = False From 21350abb1256037331231794bb3e33394ca3cc7d Mon Sep 17 00:00:00 2001 From: minhkhul Date: Wed, 22 Jan 2025 17:52:41 +0000 Subject: [PATCH 7/7] [create-pull-request] automated change --- changehc/version.cfg | 2 +- claims_hosp/version.cfg | 2 +- doctor_visits/version.cfg | 2 +- google_symptoms/version.cfg | 2 +- hhs_hosp/version.cfg | 2 +- nchs_mortality/version.cfg | 2 +- nssp/version.cfg | 2 +- quidel_covidtest/version.cfg | 2 +- sir_complainsalot/version.cfg | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/changehc/version.cfg b/changehc/version.cfg index 7d679606f..e99046324 100644 --- a/changehc/version.cfg +++ b/changehc/version.cfg @@ -1 +1 @@ -current_version = 0.3.58 +current_version = 0.3.59 diff --git a/claims_hosp/version.cfg b/claims_hosp/version.cfg index 7d679606f..e99046324 100644 --- a/claims_hosp/version.cfg +++ b/claims_hosp/version.cfg @@ -1 +1 @@ -current_version = 0.3.58 +current_version = 0.3.59 diff --git a/doctor_visits/version.cfg b/doctor_visits/version.cfg index 7d679606f..e99046324 100644 --- a/doctor_visits/version.cfg +++ b/doctor_visits/version.cfg @@ -1 +1 @@ -current_version = 0.3.58 +current_version = 0.3.59 diff --git a/google_symptoms/version.cfg b/google_symptoms/version.cfg index 7d679606f..e99046324 100644 --- a/google_symptoms/version.cfg +++ b/google_symptoms/version.cfg @@ -1 +1 @@ -current_version = 0.3.58 +current_version = 0.3.59 diff --git a/hhs_hosp/version.cfg b/hhs_hosp/version.cfg index 7d679606f..e99046324 100644 --- a/hhs_hosp/version.cfg +++ b/hhs_hosp/version.cfg @@ -1 +1 @@ -current_version = 0.3.58 +current_version = 0.3.59 diff --git a/nchs_mortality/version.cfg b/nchs_mortality/version.cfg index 7d679606f..e99046324 100644 --- a/nchs_mortality/version.cfg +++ b/nchs_mortality/version.cfg @@ -1 +1 @@ -current_version = 0.3.58 +current_version = 0.3.59 diff --git a/nssp/version.cfg b/nssp/version.cfg index 7d679606f..e99046324 100644 --- a/nssp/version.cfg +++ b/nssp/version.cfg @@ -1 +1 @@ -current_version = 0.3.58 +current_version = 0.3.59 diff --git a/quidel_covidtest/version.cfg b/quidel_covidtest/version.cfg index 7d679606f..e99046324 100644 --- a/quidel_covidtest/version.cfg +++ b/quidel_covidtest/version.cfg @@ -1 +1 @@ -current_version = 0.3.58 +current_version = 0.3.59 diff --git a/sir_complainsalot/version.cfg b/sir_complainsalot/version.cfg index 7d679606f..e99046324 100644 --- a/sir_complainsalot/version.cfg +++ b/sir_complainsalot/version.cfg @@ -1 +1 @@ -current_version = 0.3.58 +current_version = 0.3.59