From 40c68ec4e83f82a1790ec57001e3c9666ab5fd1f Mon Sep 17 00:00:00 2001 From: alex-dzeda <120701369+alex-dzeda@users.noreply.github.com> Date: Mon, 3 Jun 2024 15:21:14 -0500 Subject: [PATCH] BCDA-8146: Make file server content-encoding aware (#948) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## đŸŽĢ Ticket https://jira.cms.gov/browse/BCDA-8146 ## 🛠 Changes Added magic number check for files, to determine if it is gzip encoded (or not). ## ℹī¸ Context for reviewers As part of BCDA's gzip epic, we are aiming to simultaneously decrease costs on EFS, while also increasing the overall throughput of files. When we switch over from primarily storing content uncompressed to primarily storing data gzip-encoded, there will be at least some period of time where both encodings are present. The BCDA application will now check the magic number (https://en.wikipedia.org/wiki/Magic_number_(programming)) to determine if content is encoded or not, and will handle the data appropriately (decompressing the file if an Accept-Encoding header of gzip is not present). There is a binary file, shared_files/gzip_feature_test/corrupt_gz_file.ndjson, that is not able to be opened with a normal text editor. It's advised to open files of this type in a hex editor for manual inspection. In this case, the file has the magic number of a gzip file, but the subsequent data is not compliant. ## ✅ Acceptance Validation Added unit tests to handle the various cases. ## 🔒 Security Implications - [ ] This PR adds a new software dependency or dependencies. - [ ] This PR modifies or invalidates one or more of our security controls. - [ ] This PR stores or transmits data that was not stored or transmitted before. - [ ] This PR requires additional review of its security implications for other reasons. If any security implications apply, add Jason Ashbaugh (GitHub username: StewGoin) as a reviewer and do not merge this PR without his approval. --- bcda/api/v1/api.go | 71 +++++++++++++++++- bcda/api/v1/api_test.go | 27 ++++--- .../gzip_feature_test/corrupt_gz_file.ndjson | Bin 0 -> 10 bytes .../gzip_feature_test/single_byte_file.bin | 1 + .../test_gzip_encoded.ndjson | Bin 0 -> 981 bytes .../gzip_feature_test/test_no_encoding.ndjson | 1 + 6 files changed, 86 insertions(+), 14 deletions(-) create mode 100644 shared_files/gzip_feature_test/corrupt_gz_file.ndjson create mode 100644 shared_files/gzip_feature_test/single_byte_file.bin create mode 100644 shared_files/gzip_feature_test/test_gzip_encoded.ndjson create mode 100644 shared_files/gzip_feature_test/test_no_encoding.ndjson diff --git a/bcda/api/v1/api.go b/bcda/api/v1/api.go index d6eed8c52..b48ef59f8 100644 --- a/bcda/api/v1/api.go +++ b/bcda/api/v1/api.go @@ -1,11 +1,14 @@ package v1 import ( + "bytes" "compress/gzip" "encoding/json" + "errors" "fmt" "io" "net/http" + "os" "strings" "time" @@ -284,7 +287,12 @@ func ServeData(w http.ResponseWriter, r *http.Request) { dataDir := conf.GetEnv("FHIR_PAYLOAD_DIR") fileName := chi.URLParam(r, "fileName") jobID := chi.URLParam(r, "jobID") - w.Header().Set(constants.ContentType, "application/fhir+ndjson") + filePath := fmt.Sprintf("%s/%s/%s", dataDir, jobID, fileName) + + encoded, err := isGzipEncoded(filePath) + if err != nil { + writeServeDataFailure(err, w) + } var useGZIP bool for _, header := range r.Header.Values("Accept-Encoding") { @@ -293,20 +301,75 @@ func ServeData(w http.ResponseWriter, r *http.Request) { break } } - + w.Header().Set(constants.ContentType, "application/fhir+ndjson") if useGZIP { w.Header().Set("Content-Encoding", "gzip") gz := gzip.NewWriter(w) defer gz.Close() gzw := gzipResponseWriter{Writer: gz, ResponseWriter: w} - http.ServeFile(gzw, r, fmt.Sprintf("%s/%s/%s", dataDir, jobID, fileName)) + if encoded { + http.ServeFile(w, r, filePath) + } else { + http.ServeFile(gzw, r, filePath) + } + } else { log.API.Warnf("API request to serve data is being made without gzip for file %s for jobId %s", fileName, jobID) - http.ServeFile(w, r, fmt.Sprintf("%s/%s/%s", dataDir, jobID, fileName)) + if encoded { + //We'll do the following: 1. Open file, 2. De-compress it, 3. Serve it up. + file, err := os.Open(filePath) // #nosec G304 + if err != nil { + writeServeDataFailure(err, w) + return + } + defer file.Close() //#nosec G307 + gzipReader, err := gzip.NewReader(file) + if err != nil { + writeServeDataFailure(err, w) + return + } + defer gzipReader.Close() + _, err = io.Copy(w, gzipReader) // #nosec G110 + if err != nil { + writeServeDataFailure(err, w) + return + } + } else { + http.ServeFile(w, r, filePath) + } } } +// This function is not necessary, but helps meet the sonarQube quality gates +func writeServeDataFailure(err error, w http.ResponseWriter) { + log.API.Error(err) + http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError) +} + +// This function reads a file's magic number, to determine if it is gzip-encoded or not. +func isGzipEncoded(filePath string) (encoded bool, err error) { + file, err := os.Open(filePath) // #nosec G304 + if err != nil { + return false, err + } + defer file.Close() //#nosec G307 + + byteSlice := make([]byte, 2) + bytesRead, err := file.Read(byteSlice) + if err != nil { + return false, err + } + + //We can't compare to a magic number if there's less than 2 bytes returned. Also can't be ndjson + if bytesRead < 2 { + return false, errors.New("Invalid file with length 1 byte.") + } + + comparison := []byte{0x1f, 0x8b} + return bytes.Equal(comparison, byteSlice), nil +} + /* swagger:route GET /api/v1/metadata metadata metadata diff --git a/bcda/api/v1/api_test.go b/bcda/api/v1/api_test.go index afd0d9aef..ebdfde967 100644 --- a/bcda/api/v1/api_test.go +++ b/bcda/api/v1/api_test.go @@ -354,28 +354,30 @@ func (s *APITestSuite) TestDeleteJob() { } func (s *APITestSuite) TestServeData() { - conf.SetEnv(s.T(), "FHIR_PAYLOAD_DIR", "../../../bcdaworker/data/test") + conf.SetEnv(s.T(), "FHIR_PAYLOAD_DIR", "../../../shared_files/gzip_feature_test/") tests := []struct { name string headers []string gzipExpected bool + fileName string + validFile bool }{ - {"gzip-only", []string{"gzip"}, true}, - {"gzip-deflate", []string{"gzip, deflate"}, true}, - {"gzip-deflate-br-handle", []string{"gzip,deflate, br"}, true}, - {"gzip", []string{"deflate", "br", "gzip"}, true}, - {"non-gzip w/ deflate and br", []string{"deflate", "br"}, false}, - {"non-gzip", nil, false}, + {"no-header-gzip-encoded", []string{""}, false, "test_gzip_encoded.ndjson", true}, + {"yes-header-gzip-encoded", []string{"gzip"}, true, "test_gzip_encoded.ndjson", true}, + {"yes-header-not-encoded", []string{"gzip"}, true, "test_no_encoding.ndjson", true}, + {"bad file name", []string{""}, false, "not_a_real_file", false}, + {"single byte file", []string{""}, false, "single_byte_file.bin", false}, + {"no-header-corrupt-file", []string{""}, false, "corrupt_gz_file.ndjson", false}, //This file is kind of cool. has magic number, but otherwise arbitrary data. } for _, tt := range tests { s.T().Run(tt.name, func(t *testing.T) { defer s.SetupTest() - req := httptest.NewRequest("GET", "/data/test.ndjson", nil) + req := httptest.NewRequest("GET", fmt.Sprintf("/data/%s", tt.fileName), nil) rctx := chi.NewRouteContext() - rctx.URLParams.Add("fileName", "test.ndjson") + rctx.URLParams.Add("fileName", tt.fileName) req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx)) var useGZIP bool @@ -390,6 +392,11 @@ func (s *APITestSuite) TestServeData() { handler := http.HandlerFunc(ServeData) handler.ServeHTTP(s.rr, req) + if !tt.validFile { + assert.Equal(t, http.StatusInternalServerError, s.rr.Code) + return + } + assert.Equal(t, http.StatusOK, s.rr.Code) assert.Equal(t, "application/fhir+ndjson", s.rr.Result().Header.Get(constants.ContentType)) @@ -407,7 +414,7 @@ func (s *APITestSuite) TestServeData() { b = s.rr.Body.Bytes() } - assert.Contains(t, string(b), `{"resourceType": "Bundle", "total": 33, "entry": [{"resource": {"status": "active", "diagnosis": [{"diagnosisCodeableConcept": {"coding": [{"system": "http://hl7.org/fhir/sid/icd-9-cm", "code": "2113"}]},`) + assert.Contains(t, string(b), `{"billablePeriod":{"end":"2016-10-30","start":"2016-10-30"},"contained":[{"birthDate":"1953-05-18","gender":"male","id":"patient","identifier":[{"system":"http://hl7.org/fhir/sid/us-mbi","type":{"coding":[{"code":"MC","display":"Patient's Medicare Number","system":"http://terminology.hl7.org/CodeSystem/v2-0203"}]},"value":"1S00E00KW61"}],"name":[{"family":"Quitzon246","given":["Rodolfo763"],"text":"Rodolfo763 Quitzon246 ([max 10 chars of first], [max 15 chars of last])"}],"resourceType":"Patient"},{"id":"provider-org","identifier":[{"system":"https://bluebutton.cms.gov/resources/variables/fiss/meda-prov-6","type":{"coding":[{"code":"PRN","display":"Provider number","system":"http://terminology.hl7.org/CodeSystem/v2-0203"}]},"value":"450702"},{"system":"https://bluebutton.cms.gov/resources/variables/fiss/fed-tax-nb","type":{"coding":[{"code":"TAX","display":"Tax ID number","system":"http://terminology.hl7.org/CodeSystem/v2-0203"}]},"value":"XX-XXXXXXX"},{"system":"http://hl7.org/fhir/sid/us-npi","type":{"coding":[{"code":"npi","display":"National Provider Identifier","system":"http://hl7.org/fhir/us/carin-bb/CodeSystem/C4BBIdentifierType"}]},"value":"8884381863"}],"resourceType":"Organization"}],"created":"2024-04-19T11:37:21-04:00","diagnosis":[{"diagnosisCodeableConcept":{"coding":[{"code":"P292","system":"http://hl7.org/fhir/sid/icd-10-cm"}]},"sequence":0}],"extension":[{"url":"https://bluebutton.cms.gov/resources/variables/fiss/serv-typ-cd","valueCoding":{"code":"2","system":"https://bluebutton.cms.gov/resources/variables/fiss/serv-typ-cd"}}],"facility":{"extension":[{"url":"https://bluebutton.cms.gov/resources/variables/fiss/lob-cd","valueCoding":{"code":"2","system":"https://bluebutton.cms.gov/resources/variables/fiss/lob-cd"}}]},"id":"f-LTEwMDE5NTYxNA","identifier":[{"system":"https://bluebutton.cms.gov/resources/variables/fiss/dcn","type":{"coding":[{"code":"uc","display":"Unique Claim ID","system":"http://hl7.org/fhir/us/carin-bb/CodeSystem/C4BBIdentifierType"}]},"value":"dcn-100195614"}],"insurance":[{"coverage":{"identifier":{"system":"https://bluebutton.cms.gov/resources/variables/fiss/payers-name","value":"MEDICARE"}},"focal":true,"sequence":0}],"meta":{"lastUpdated":"2023-04-13T18:18:27.334-04:00"},"patient":{"reference":"#patient"},"priority":{"coding":[{"code":"normal","display":"Normal","system":"http://terminology.hl7.org/CodeSystem/processpriority"}]},"provider":{"reference":"#provider-org"},"resourceType":"Claim","status":"active","supportingInfo":[{"category":{"coding":[{"code":"typeofbill","display":"Type of Bill","system":"http://hl7.org/fhir/us/carin-bb/CodeSystem/C4BBSupportingInfoType"}]},"code":{"coding":[{"code":"1","system":"https://bluebutton.cms.gov/resources/variables/fiss/freq-cd"}]},"sequence":1}],"total":{"currency":"USD","value":639.66},"type":{"coding":[{"code":"institutional","display":"Institutional","system":"http://terminology.hl7.org/CodeSystem/claim-type"}]},"use":"claim"}`) }) } diff --git a/shared_files/gzip_feature_test/corrupt_gz_file.ndjson b/shared_files/gzip_feature_test/corrupt_gz_file.ndjson new file mode 100644 index 0000000000000000000000000000000000000000..648bbdc615da950d0539bc103d4cdbf229a0d927 GIT binary patch literal 10 Lcmb2|W`F_!1|tBg literal 0 HcmV?d00001 diff --git a/shared_files/gzip_feature_test/single_byte_file.bin b/shared_files/gzip_feature_test/single_byte_file.bin new file mode 100644 index 000000000..6b2aaa764 --- /dev/null +++ b/shared_files/gzip_feature_test/single_byte_file.bin @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/shared_files/gzip_feature_test/test_gzip_encoded.ndjson b/shared_files/gzip_feature_test/test_gzip_encoded.ndjson new file mode 100644 index 0000000000000000000000000000000000000000..7afb5a130ac55b23871742a4c91f3fa41c458e02 GIT binary patch literal 981 zcmV;`11kIM5 zQ(0v4WE8`L4+dAZ#QQ^U|1aDPRoau8mvN=dgYUH$!~I1;wR!l%0a2)B ztr*T-rJm~mjlbD8LCGagQ4y)~Z*)Q@L9i%lT-3%$(PO3V<#HK3XVAftDsFY8gN$n( zl$cUtbZ7Dq|V&+a;;=5*!FrXNe` z?vC8GpF>)2*hW;xZPVWGVP8?hrJ#kkYsze|5dV0}S_h0OF34i>lFDRoe!dUlblG7w z9*+m{I2w=Q*982x%BkR=9V~ax6jFojxato`I3UsKJc^R|Boa!XQ>^GtZeyce5z3}ge0^$ab-|wi`s^+Rq=!R)2Noj ztSiD&*cNTFg}g(20QWsf8jC)oj2GOj-E#OKCWT!5R1(|xjq8RRb{Y9=e)0e1^kR55 z|8I45_Q3-$W#Xv0YS!`AC&8a;^d<%6r8k@YBs--{kSGkJ)8Qx@ILC0IYek(h9kwMZ znmfoZi}B7{RCJ9>6SoKVU@kAFv&q@b1vHHhnPjwp#Hbn%+)|>U7K7bnpQ?24@Yp## zo=4*(8YlgeUK|g$E^qwZlOoxG!VFa#5q_<9YY!^LrP?a_mH8)CNsI0Xd_MZlkzFat zQ0rYj*OljWdx-Gm9&QfCo)by?3Ny8a1Sm6nX-li?s*=iZk<9nRZ}AC=;52hLuOZSvRfMDO)^rtrxby*qUyt0N^ D!=B*u literal 0 HcmV?d00001 diff --git a/shared_files/gzip_feature_test/test_no_encoding.ndjson b/shared_files/gzip_feature_test/test_no_encoding.ndjson new file mode 100644 index 000000000..644aac3a3 --- /dev/null +++ b/shared_files/gzip_feature_test/test_no_encoding.ndjson @@ -0,0 +1 @@ +{"billablePeriod":{"end":"2016-10-30","start":"2016-10-30"},"contained":[{"birthDate":"1953-05-18","gender":"male","id":"patient","identifier":[{"system":"http://hl7.org/fhir/sid/us-mbi","type":{"coding":[{"code":"MC","display":"Patient's Medicare Number","system":"http://terminology.hl7.org/CodeSystem/v2-0203"}]},"value":"1S00E00KW61"}],"name":[{"family":"Quitzon246","given":["Rodolfo763"],"text":"Rodolfo763 Quitzon246 ([max 10 chars of first], [max 15 chars of last])"}],"resourceType":"Patient"},{"id":"provider-org","identifier":[{"system":"https://bluebutton.cms.gov/resources/variables/fiss/meda-prov-6","type":{"coding":[{"code":"PRN","display":"Provider number","system":"http://terminology.hl7.org/CodeSystem/v2-0203"}]},"value":"450702"},{"system":"https://bluebutton.cms.gov/resources/variables/fiss/fed-tax-nb","type":{"coding":[{"code":"TAX","display":"Tax ID number","system":"http://terminology.hl7.org/CodeSystem/v2-0203"}]},"value":"XX-XXXXXXX"},{"system":"http://hl7.org/fhir/sid/us-npi","type":{"coding":[{"code":"npi","display":"National Provider Identifier","system":"http://hl7.org/fhir/us/carin-bb/CodeSystem/C4BBIdentifierType"}]},"value":"8884381863"}],"resourceType":"Organization"}],"created":"2024-04-19T11:37:21-04:00","diagnosis":[{"diagnosisCodeableConcept":{"coding":[{"code":"P292","system":"http://hl7.org/fhir/sid/icd-10-cm"}]},"sequence":0}],"extension":[{"url":"https://bluebutton.cms.gov/resources/variables/fiss/serv-typ-cd","valueCoding":{"code":"2","system":"https://bluebutton.cms.gov/resources/variables/fiss/serv-typ-cd"}}],"facility":{"extension":[{"url":"https://bluebutton.cms.gov/resources/variables/fiss/lob-cd","valueCoding":{"code":"2","system":"https://bluebutton.cms.gov/resources/variables/fiss/lob-cd"}}]},"id":"f-LTEwMDE5NTYxNA","identifier":[{"system":"https://bluebutton.cms.gov/resources/variables/fiss/dcn","type":{"coding":[{"code":"uc","display":"Unique Claim ID","system":"http://hl7.org/fhir/us/carin-bb/CodeSystem/C4BBIdentifierType"}]},"value":"dcn-100195614"}],"insurance":[{"coverage":{"identifier":{"system":"https://bluebutton.cms.gov/resources/variables/fiss/payers-name","value":"MEDICARE"}},"focal":true,"sequence":0}],"meta":{"lastUpdated":"2023-04-13T18:18:27.334-04:00"},"patient":{"reference":"#patient"},"priority":{"coding":[{"code":"normal","display":"Normal","system":"http://terminology.hl7.org/CodeSystem/processpriority"}]},"provider":{"reference":"#provider-org"},"resourceType":"Claim","status":"active","supportingInfo":[{"category":{"coding":[{"code":"typeofbill","display":"Type of Bill","system":"http://hl7.org/fhir/us/carin-bb/CodeSystem/C4BBSupportingInfoType"}]},"code":{"coding":[{"code":"1","system":"https://bluebutton.cms.gov/resources/variables/fiss/freq-cd"}]},"sequence":1}],"total":{"currency":"USD","value":639.66},"type":{"coding":[{"code":"institutional","display":"Institutional","system":"http://terminology.hl7.org/CodeSystem/claim-type"}]},"use":"claim"} \ No newline at end of file