Skip to content

Commit

Permalink
determine sci-name, hierarchy, synonym (close #10)
Browse files Browse the repository at this point in the history
Scientific Name, Hierarchy, Synonym can be expressed in different ways
in a Darwin Core Archive. Determine how they are expressed in a
concrete Darwin Core Archive file.
  • Loading branch information
dimus committed Feb 19, 2024
1 parent 1807f08 commit 85a0480
Show file tree
Hide file tree
Showing 33 changed files with 498 additions and 703 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

Add [#10]: find how sci-name, hierarchy, synonyms are expressed in a DwCA file.
Add [#4]: convenience meta object.

## [v0.0.1] - 2024-02-12 Mon
Expand Down
66 changes: 61 additions & 5 deletions dwca.go
Original file line number Diff line number Diff line change
@@ -1,24 +1,38 @@
package dwca

import (
"errors"
"os"
"path/filepath"
"strings"

"github.com/gnames/dwca/config"
"github.com/gnames/dwca/ent/eml"
"github.com/gnames/dwca/ent/meta"
"github.com/gnames/dwca/internal/ent/dcfile"
"github.com/gnames/dwca/internal/ent/diagn"
"github.com/gnames/gnparser"
)

// arch is the Archive implementation constructed by New.
type arch struct {
// cfg is the configuration supplied to New.
cfg config.Config
// dcFile is the file accessor that core and extension reads are delegated to.
dcFile dcfile.DCFile
// metaData is the archive's meta description; it is passed to dcFile on
// every core/extension read.
metaData *meta.Meta
// emlData is the EML document returned by EML().
emlData *eml.EML
// Embedded diagnostics value.
// NOTE(review): not assigned anywhere in the visible code — confirm how it is populated.
diagn.Diagnostics
// gnpPool is a channel holding gnparser instances; Diagnose receives one
// from it and sends it back when finished.
gnpPool chan gnparser.GNparser
}

func New(cfg config.Config, df dcfile.DCFile) Archive {
return &arch{cfg: cfg, dcFile: df}
res := &arch{cfg: cfg, dcFile: df}
poolSize := 5
gnpPool := make(chan gnparser.GNparser, poolSize)
for i := 0; i < poolSize; i++ {
cfgGNP := gnparser.NewConfig()
gnpPool <- gnparser.New(cfgGNP)
}
res.gnpPool = gnpPool
return res
}

// Config returns the configuration object of the archive.
Expand Down Expand Up @@ -60,10 +74,10 @@ func (a *arch) EML() *eml.EML {
return a.emlData
}

// CoreData takes an offset and a limit and returns a slice of slices of
// CoreSlice takes an offset and a limit and returns a slice of slices of
// strings, each slice representing a row of the core file. If limit and
// offset are provided, it returns the corresponding subset of the data.
func (a *arch) CoreData(offset, limit int) ([][]string, error) {
func (a *arch) CoreSlice(offset, limit int) ([][]string, error) {
return a.dcFile.CoreData(a.metaData, offset, limit)
}

Expand All @@ -74,12 +88,12 @@ func (a *arch) CoreStream(chCore chan<- []string) error {
return a.dcFile.CoreStream(a.metaData, chCore)
}

// ExtensionData takes an index, offset and limit and returns a slice of
// ExtensionSlice takes an index, offset and limit and returns a slice of
// slices of strings, each slice representing a row of the extension file.
// Index corresponds to the index of the extension in the extension list.
// If limit and offset are provided, it returns the corresponding subset
// of the data.
func (a *arch) ExtensionData(index, offset, limit int) ([][]string, error) {
func (a *arch) ExtensionSlice(index, offset, limit int) ([][]string, error) {
return a.dcFile.ExtensionData(index, a.metaData, offset, limit)
}

Expand Down Expand Up @@ -122,3 +136,45 @@ func (a *arch) getEML(path string) error {

return nil
}

// Diagnose inspects a sample of the core file and reports how scientific
// names, synonyms and hierarchy are expressed in the archive.
//
// It returns an error when the sample cannot be read or when the core file
// yields no rows.
func (a *arch) Diagnose() (*diagn.Diagnostics, error) {
	cs, exts, err := a.coreSample()
	if err != nil {
		return nil, err
	}
	// coreSample allocates its result with make, so on success the slice is
	// never nil; an empty core file must be detected by length.
	if len(cs) == 0 {
		return nil, errors.New("no data in the core file")
	}

	// Borrow a parser from the pool and hand it back once diagnostics are done.
	prs := <-a.gnpPool
	defer func() { a.gnpPool <- prs }()

	return diagn.New(prs, cs, exts), nil
}

// coreSample requests the core rows at offset 0 with a limit of 1000 and
// converts every row into a map keyed by the field term taken from the
// simplified meta description. Columns that have no term are left out.
//
// The second result maps each extension key of the simplified meta to the
// lower-cased location of that extension.
func (a *arch) coreSample() (
	[]map[string]string,
	map[string]string,
	error,
) {
	rows, err := a.CoreSlice(0, 1000)
	if err != nil {
		return nil, nil, err
	}

	simple := a.metaData.Simplify()

	extLocs := make(map[string]string)
	for name, ext := range simple.ExtensionsData {
		extLocs[name] = strings.ToLower(ext.Location)
	}

	sample := make([]map[string]string, len(rows))
	for i := range rows {
		rec := make(map[string]string)
		for col, val := range rows[i] {
			// A missing or unnamed field yields an empty term; skip it.
			term := simple.CoreData.FieldsIdx[col].Term
			if term != "" {
				rec[term] = val
			}
		}
		sample[i] = rec
	}
	return sample, extLocs, nil
}
102 changes: 100 additions & 2 deletions dwca_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (

"github.com/gnames/dwca"
"github.com/gnames/dwca/internal/ent/dcfile"
"github.com/gnames/dwca/internal/ent/diagn"
"github.com/stretchr/testify/assert"
)

Expand Down Expand Up @@ -50,7 +51,7 @@ func TestCoreData(t *testing.T) {
meta := arc.Meta()
assert.NotNil(meta)

data, err := arc.CoreData(v.offset, v.limit)
data, err := arc.CoreSlice(v.offset, v.limit)
assert.Nil(err)
assert.Equal(v.len, len(data))
assert.Equal(v.res00, data[0][0])
Expand Down Expand Up @@ -120,7 +121,7 @@ func TestExtensionData(t *testing.T) {
meta := arc.Meta()
assert.NotNil(meta)

data, err := arc.ExtensionData(0, v.offset, v.limit)
data, err := arc.ExtensionSlice(0, v.offset, v.limit)
assert.Nil(err)
assert.Equal(v.len, len(data))
if len(data) > 0 {
Expand Down Expand Up @@ -164,3 +165,100 @@ func TestExtensionStream(t *testing.T) {
assert.Equal(v.len, count)
}
}

// TestSciNameDiagnose checks that Diagnose reports the expected
// scientific-name type for every fixture archive in
// testdata/diagn/scinames.
func TestSciNameDiagnose(t *testing.T) {
	assert := assert.New(t)

	tests := []struct {
		msg    string
		file   string
		snType diagn.SciNameType
	}{
		{"can", "canonical.tar.gz", diagn.SciNameCanonical},
		{"full+auth", "full-auth-dup.tar.gz", diagn.SciNameFull},
		{"unknown", "unknown.tar.gz", diagn.SciNameUnknown},
		{"composite", "composite.tar.gz", diagn.SciNameComposite},
	}

	for _, v := range tests {
		path := filepath.Join("testdata", "diagn", "scinames", v.file)
		arc, err := dwca.Factory(path)
		// Abandon the case on failure: using arc after an error would
		// panic and hide the real cause.
		if !assert.Nil(err, v.msg) {
			continue
		}
		assert.Implements((*dwca.Archive)(nil), arc, v.msg)

		if !assert.Nil(arc.Load(), v.msg) {
			continue
		}
		assert.NotNil(arc.Meta(), v.msg)

		diag, err := arc.Diagnose()
		if !assert.Nil(err, v.msg) || !assert.NotNil(diag, v.msg) {
			continue
		}
		assert.Equal(v.snType.String(), diag.SciNameType.String(), v.msg)
	}
}

// TestSynDiagnose checks that Diagnose reports the expected synonym type
// for every fixture archive in testdata/diagn/synonyms.
func TestSynDiagnose(t *testing.T) {
	assert := assert.New(t)

	tests := []struct {
		msg     string
		file    string
		synType diagn.SynonymType
	}{
		{"ext", "in_extension.tar.gz", diagn.SynExtension},
		{"accepted", "in_core_accepted.tar.gz", diagn.SynAcceptedID},
		{"hierarchy", "hierarchy_deprecated.tar.gz", diagn.SynHierarchy},
		{"hierarchy", "hierarchy.tar.gz", diagn.SynHierarchy},
		{"unknown", "unknown.tar.gz", diagn.SynUnknown},
	}

	for _, v := range tests {
		path := filepath.Join("testdata", "diagn", "synonyms", v.file)
		arc, err := dwca.Factory(path)
		// Abandon the case on failure: using arc after an error would
		// panic and hide the real cause.
		if !assert.Nil(err, v.msg) {
			continue
		}
		assert.Implements((*dwca.Archive)(nil), arc, v.msg)

		if !assert.Nil(arc.Load(), v.msg) {
			continue
		}
		assert.NotNil(arc.Meta(), v.msg)

		diag, err := arc.Diagnose()
		if !assert.Nil(err, v.msg) || !assert.NotNil(diag, v.msg) {
			continue
		}
		assert.Equal(v.synType.String(), diag.SynonymType.String(), v.msg)
	}
}

// TestHierDiagnose checks that Diagnose reports the expected hierarchy
// type for every fixture archive in testdata/diagn/hierarchy.
func TestHierDiagnose(t *testing.T) {
	assert := assert.New(t)

	tests := []struct {
		msg   string
		file  string
		hType diagn.HierType
	}{
		{"tree", "tree.tar.gz", diagn.HierTree},
		{"tree", "tree_depr.tar.gz", diagn.HierTree},
		{"flat", "flat.tar.gz", diagn.HierFlat},
		{"unknown", "unknown.tar.gz", diagn.HierUnknown},
	}

	for _, v := range tests {
		path := filepath.Join("testdata", "diagn", "hierarchy", v.file)
		arc, err := dwca.Factory(path)
		// Abandon the case on failure: using arc after an error would
		// panic and hide the real cause.
		if !assert.Nil(err, v.msg) {
			continue
		}
		assert.Implements((*dwca.Archive)(nil), arc, v.msg)

		if !assert.Nil(arc.Load(), v.msg) {
			continue
		}
		assert.NotNil(arc.Meta(), v.msg)

		diag, err := arc.Diagnose()
		if !assert.Nil(err, v.msg) || !assert.NotNil(diag, v.msg) {
			continue
		}
		assert.Equal(v.hType.String(), diag.HierType.String(), v.msg)
	}
}
6 changes: 5 additions & 1 deletion ent/meta/data.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package meta

type Data struct {
type MetaSimple struct {
CoreData
ExtensionsData map[string]ExtensionData
}
Expand All @@ -9,12 +9,16 @@ type CoreData struct {
Index int
Term string
TermFull string
Location string
FieldsData map[string]FieldData
FieldsIdx map[int]FieldData
}

// ExtensionData is the simplified description of a single extension, stored
// as a value of the ExtensionsData map.
type ExtensionData struct {
// CoreIndex is presumably the index of the column that refers to the core
// record — TODO confirm against the code that fills it.
CoreIndex int
// Location is the extension's location string (presumably its file name
// from the meta file — TODO confirm).
Location string
// FieldsData holds field descriptions keyed by a string, presumably the
// field term — TODO confirm.
FieldsData map[string]FieldData
// FieldsIdx holds field descriptions keyed by an integer, presumably the
// column index — TODO confirm.
FieldsIdx map[int]FieldData
}

type FieldData struct {
Expand Down
6 changes: 4 additions & 2 deletions ent/meta/data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ func TestToData(t *testing.T) {

m, err = meta.New(f)
assert.Nil(err)
data := m.ToData()
data := m.Simplify()
assert.NotNil(data)
assert.Equal(0, data.Index)
assert.Equal("", data.TermFull)
assert.Equal("http://rs.tdwg.org/dwc/terms/Taxon", data.TermFull)
assert.Equal("taxon", data.Term)
assert.Equal(3, len(data.ExtensionsData))
}
Loading

0 comments on commit 85a0480

Please sign in to comment.