Skip to content

Commit

Permalink
tika: implement Parse and Translate methods that return readers (#34)
Browse files Browse the repository at this point in the history
Always reading the response body into memory results in unnecessary and
undesired allocations, especially when sending large documents to Tika
for parsing.

The call method, which is called for all API requests, doesn't copy the
response anymore. Instead, callString copies it for the methods that
return strings and callUnmarshal does its own buffering using the JSON
decoder.

This commit also adds a new method, ParseReader, which lets users of the
client skip copying the response and instead receive the response body
reader directly; the caller is then responsible for closing it.

As with ParseReader, a TranslateReader is provided for the same reasons.
  • Loading branch information
tmaxmax authored Sep 2, 2021
1 parent 1e81b65 commit ffdbbef
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 10 deletions.
41 changes: 31 additions & 10 deletions tika/tika.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net/http"
"reflect"
"strings"
Expand Down Expand Up @@ -113,9 +112,9 @@ const (
// parsing. See ParseRecursive and MetaRecursive.
const XTIKAContent = "X-TIKA:content"

// call makes the given request to c and returns the result as a []byte and
// error. call returns an error if the response code is not 200 StatusOK.
func (c *Client) call(ctx context.Context, input io.Reader, method, path string, header http.Header) ([]byte, error) {
// call makes the given request to c and returns the response body.
// call returns an error and a nil reader if the response code is not 200 StatusOK.
func (c *Client) call(ctx context.Context, input io.Reader, method, path string, header http.Header) (io.ReadCloser, error) {
if c.httpClient == nil {
c.httpClient = http.DefaultClient
}
Expand All @@ -130,11 +129,11 @@ func (c *Client) call(ctx context.Context, input io.Reader, method, path string,
if err != nil {
return nil, err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
resp.Body.Close()
return nil, ClientError{resp.StatusCode}
}
return ioutil.ReadAll(resp.Body)
return resp.Body, nil
}

// callString makes the given request to c and returns the result as a string
Expand All @@ -144,15 +143,28 @@ func (c *Client) callString(ctx context.Context, input io.Reader, method, path s
if err != nil {
return "", err
}
return string(body), nil
defer body.Close()

b := &strings.Builder{}
if _, err := io.Copy(b, body); err != nil {
return "", err
}

return b.String(), nil
}

// Parse parses the given input, returning the body of the input and an error.
// Parse sends input to the /tika endpoint with a PUT request and returns the
// result as a string. When the returned error is not nil, the string is
// undefined.
func (c *Client) Parse(ctx context.Context, input io.Reader) (string, error) {
	const (
		method = "PUT"
		path   = "/tika"
	)
	return c.callString(ctx, input, method, path)
}

// ParseReader sends input to the /tika endpoint with a PUT request and hands
// back the response body as a reader instead of copying it into memory.
// On success the caller owns the returned reader and must close it; when the
// error is not nil the returned reader is nil.
func (c *Client) ParseReader(ctx context.Context, input io.Reader) (io.ReadCloser, error) {
	const (
		method = "PUT"
		path   = "/tika"
	)
	return c.call(ctx, input, method, path, nil)
}

// ParseRecursive parses the given input and all embedded documents, returning a
// list of the contents of the input with one element per document. See
// MetaRecursive for access to all metadata fields. If the error is not nil, the
Expand Down Expand Up @@ -228,8 +240,9 @@ func (c *Client) MetaRecursiveType(ctx context.Context, input io.Reader, content
if err != nil {
return nil, err
}
defer body.Close()
var m []map[string]interface{}
if err := json.Unmarshal(body, &m); err != nil {
if err := json.NewDecoder(body).Decode(&m); err != nil {
return nil, err
}
var r []map[string][]string
Expand Down Expand Up @@ -262,6 +275,13 @@ func (c *Client) Translate(ctx context.Context, input io.Reader, t Translator, s
return c.callString(ctx, input, "POST", fmt.Sprintf("/translate/all/%s/%s/%s", t, src, dst))
}

// TranslateReader translates the given input from the src language to the dst
// language using t, returning the translated document as a reader rather than
// a string. If an error occurs the reader is nil; otherwise the caller must
// close the reader once done with it.
func (c *Client) TranslateReader(ctx context.Context, input io.Reader, t Translator, src, dst string) (io.ReadCloser, error) {
	// The translator and both languages are path segments of the endpoint.
	path := fmt.Sprintf("/translate/all/%s/%s/%s", t, src, dst)
	return c.call(ctx, input, "POST", path, nil)
}

// Version returns the default hello message from Tika server.
func (c *Client) Version(ctx context.Context) (string, error) {
return c.callString(ctx, nil, "GET", "/version")
Expand All @@ -275,7 +295,8 @@ func (c *Client) callUnmarshal(ctx context.Context, path string, v interface{})
if err != nil {
return err
}
return json.Unmarshal(body, v)
defer body.Close()
return json.NewDecoder(body).Decode(v)
}

// Parsers returns the list of available parsers and an error. If the error is
Expand Down
43 changes: 43 additions & 0 deletions tika/tika_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"net/http"
"net/http/httptest"
"os"
Expand Down Expand Up @@ -74,6 +75,27 @@ func TestParse(t *testing.T) {
}
}

// TestParseReader checks that ParseReader returns a reader that yields exactly
// the body served by the test server.
func TestParseReader(t *testing.T) {
	want := "test value"
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		fmt.Fprint(w, want)
	}))
	defer ts.Close()
	c := NewClient(nil, ts.URL)
	body, err := c.ParseReader(context.Background(), nil)
	if err != nil {
		// Report the actual error; the request failed, it did not "return nil".
		t.Fatalf("ParseReader returned error %v, want body %q", err, want)
	}
	defer body.Close()
	got, err := ioutil.ReadAll(body)
	if err != nil {
		t.Fatalf("Reading the returned body failed: %v", err)
	}
	if s := string(got); s != want {
		t.Errorf("ParseReader got %q, want %q", s, want)
	}
}

func TestParseRecursive(t *testing.T) {
tests := []struct {
response string
Expand Down Expand Up @@ -379,6 +401,27 @@ func TestTranslate(t *testing.T) {
}
}

// TestTranslateReader checks that TranslateReader returns a reader that yields
// exactly the body served by the test server.
func TestTranslateReader(t *testing.T) {
	want := "test value"
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		fmt.Fprint(w, want)
	}))
	defer ts.Close()
	c := NewClient(nil, ts.URL)
	body, err := c.TranslateReader(context.Background(), nil, "translator", "src", "dst")
	if err != nil {
		// Report the actual error; the request failed, it did not "return nil".
		t.Fatalf("TranslateReader returned error %v, want body %q", err, want)
	}
	defer body.Close()
	got, err := ioutil.ReadAll(body)
	if err != nil {
		t.Fatalf("Reading the returned body failed: %v", err)
	}
	if s := string(got); s != want {
		t.Errorf("TranslateReader got %q, want %q", s, want)
	}
}

func TestParsers(t *testing.T) {
tests := []struct {
response string
Expand Down

0 comments on commit ffdbbef

Please sign in to comment.