Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ORC-1098: [C++] Support specifying type ids or column names in cpp tools #1020

Merged
merged 2 commits into from
Jan 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion c++/src/Reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,15 @@ namespace orc {
if (ite != nameIdMap.end()) {
updateSelectedByTypeId(selectedColumns, ite->second);
} else {
throw ParseError("Invalid column selected " + fieldName);
bool first = true;
std::ostringstream ss;
ss << "Invalid column selected " << fieldName << ". Valid names are ";
for (auto it = nameIdMap.begin(); it != nameIdMap.end(); ++it) {
if (!first) ss << ", ";
ss << it->first;
first = false;
}
throw ParseError(ss.str());
}
}

Expand Down
45 changes: 41 additions & 4 deletions site/_docs/cpp-tools.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@ Displays the contents of the ORC file as a JSON document. With the
`columns` argument only the selected columns are printed.

~~~ shell
% orc-contents [--columns=1,2,...] <filename>
% orc-contents [options] <filename>...
Options:
-h --help
-c --columns Comma separated list of top-level column fields
-t --columnTypeIds Comma separated list of column type ids
-n --columnNames Comma separated list of column names
-b --batch Batch size for reading
~~~

If you run it on the example file TestOrcFile.test1.orc, you'll see (without
Expand Down Expand Up @@ -115,7 +121,13 @@ to set the batch size which is 1024 rows by default. It is useful to check
if the ORC file is damaged.

~~~ shell
% orc-scan [--batch=<size>] <filename>
% orc-scan [options] <filename>...
Options:
-h --help
-c --columns Comma separated list of top-level column fields
-t --columnTypeIds Comma separated list of column type ids
-n --columnNames Comma separated list of column names
-b --batch Batch size for reading
~~~

If you run it on the example file TestOrcFile.test1.orc, you'll see:
Expand All @@ -135,7 +147,7 @@ With the `withIndex` option to include column statistics in each row group.
% orc-statistics [--withIndex] <filename>
~~~

If you run it on the example file TestOrcFile.TestOrcFile.columnProjection.orc
If you run it on the example file TestOrcFile.columnProjection.orc
you'll see:

~~~ shell
Expand Down Expand Up @@ -265,4 +277,29 @@ Has null: no
Minimum: 1059d81c9025a217
Maximum: ffc17f0e35e1a6c0
Total length: 15941
~~~
~~~

## orc-memory

Estimates the memory footprint for reading the ORC file.

~~~ shell
% orc-memory [options] <filename>...
Options:
-h --help
-c --columns Comma separated list of top-level column fields
-t --columnTypeIds Comma separated list of column type ids
-n --columnNames Comma separated list of column names
-b --batch Batch size for reading
~~~

If you run it on the example file TestOrcFile.columnProjection.orc
you'll see:

~~~ shell
% orc-memory examples/TestOrcFile.columnProjection.orc
Reader memory estimate: 202972
Batch memory estimate: 27000
Total memory estimate: 229972
Actual max memory used: 160381
~~~
4 changes: 4 additions & 0 deletions tools/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g ${CXX11_FLAGS} ${WARN_FLAGS}")

add_executable (orc-contents
FileContents.cc
ToolsHelper.cc
)

target_link_libraries (orc-contents
Expand All @@ -37,6 +38,7 @@ target_link_libraries (orc-contents

add_executable (orc-scan
FileScan.cc
ToolsHelper.cc
)

target_link_libraries (orc-scan
Expand All @@ -46,6 +48,7 @@ target_link_libraries (orc-scan

add_executable (orc-metadata
FileMetadata.cc
ToolsHelper.cc
)

target_link_libraries (orc-metadata
Expand All @@ -65,6 +68,7 @@ target_link_libraries (orc-statistics

add_executable (orc-memory
FileMemory.cc
ToolsHelper.cc
)

target_link_libraries (orc-memory
Expand Down
50 changes: 15 additions & 35 deletions tools/src/FileContents.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,11 @@
* limitations under the License.
*/

#include "orc/orc-config.hh"
#include "orc/ColumnPrinter.hh"
#include "orc/Exceptions.hh"
#include "ToolsHelper.hh"

#include <memory>
#include <string>
#include <iostream>
#include <string>

void printContents(const char* filename, const orc::RowReaderOptions& rowReaderOpts) {
orc::ReaderOptions readerOpts;
Expand All @@ -50,40 +47,23 @@ void printContents(const char* filename, const orc::RowReaderOptions& rowReaderO
}

int main(int argc, char* argv[]) {
if (argc < 2) {
std::cout << "Usage: orc-contents <filename> [--columns=1,2,...]\n"
<< "Print contents of <filename>.\n"
<< "If columns are specified, only these top-level (logical) columns are printed.\n" ;
uint64_t batchSize; // not used
orc::RowReaderOptions rowReaderOptions;
bool success = parseOptions(&argc, &argv, &batchSize, &rowReaderOptions);

if (argc < 1 || !success) {
std::cerr << "Usage: orc-contents [options] <filename>...\n";
printOptions(std::cerr);
std::cerr << "Print contents of ORC files.\n";
return 1;
}
try {
const std::string COLUMNS_PREFIX = "--columns=";
std::list<uint64_t> cols;
char* filename = ORC_NULLPTR;

// Read command-line options
char *param, *value;
for (int i = 1; i < argc; i++) {
if ( (param = std::strstr(argv[i], COLUMNS_PREFIX.c_str())) ) {
value = std::strtok(param+COLUMNS_PREFIX.length(), "," );
while (value) {
cols.push_back(static_cast<uint64_t>(std::atoi(value)));
value = std::strtok(ORC_NULLPTR, "," );
}
} else {
filename = argv[i];
}
for (int i = 0; i < argc; ++i) {
try {
printContents(argv[i], rowReaderOptions);
} catch (std::exception& ex) {
std::cerr << "Caught exception in " << argv[i] << ": " << ex.what() << "\n";
return 1;
}
orc::RowReaderOptions rowReaderOpts;
if (cols.size() > 0) {
rowReaderOpts.include(cols);
}
if (filename != ORC_NULLPTR) {
printContents(filename, rowReaderOpts);
}
} catch (std::exception& ex) {
std::cerr << "Caught exception: " << ex.what() << "\n";
return 1;
}
return 0;
}
79 changes: 29 additions & 50 deletions tools/src/FileMemory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,12 @@
* limitations under the License.
*/

#include "orc/orc-config.hh"
#include "orc/ColumnPrinter.hh"
#include "orc/Exceptions.hh"
#include "ToolsHelper.hh"

#include <string>
#include <memory>
#include <iostream>
#include <map>
#include <exception>

class TestMemoryPool: public orc::MemoryPool {
private:
Expand Down Expand Up @@ -60,13 +57,9 @@ class TestMemoryPool: public orc::MemoryPool {
TestMemoryPool::~TestMemoryPool() {}

void processFile(const char* filename,
const std::list<uint64_t>& cols,
uint32_t batchSize) {
const orc::RowReaderOptions& rowReaderOpts,
uint64_t batchSize) {
orc::ReaderOptions readerOpts;
orc::RowReaderOptions rowReaderOpts;
if (cols.size() > 0) {
rowReaderOpts.include(cols);
}
std::unique_ptr<orc::MemoryPool> pool(new TestMemoryPool());
readerOpts.setMemoryPool(*(pool.get()));

Expand All @@ -76,7 +69,18 @@ void processFile(const char* filename,

std::unique_ptr<orc::ColumnVectorBatch> batch =
rowReader->createRowBatch(batchSize);
uint64_t readerMemory = reader->getMemoryUseByFieldId(cols);
uint64_t readerMemory;
if (rowReaderOpts.getIndexesSet()) {
readerMemory = reader->getMemoryUseByFieldId(rowReaderOpts.getInclude());
} else if (rowReaderOpts.getNamesSet()) {
readerMemory = reader->getMemoryUseByName(rowReaderOpts.getIncludeNames());
} else if (rowReaderOpts.getTypeIdsSet()) {
readerMemory = reader->getMemoryUseByTypeId(rowReaderOpts.getInclude());
} else {
// default is to select all columns
readerMemory = reader->getMemoryUseByName({});
}

uint64_t batchMemory = batch->getMemoryUsage();
while (rowReader->next(*batch)) {}
uint64_t actualMemory =
Expand All @@ -93,47 +97,22 @@ void processFile(const char* filename,
}

int main(int argc, char* argv[]) {
if (argc < 2) {
std::cout << "Usage: orc-memory [--columns=column1,column2,...] "
<< "[--batch=rows_in_batch] <filename> \n";
uint64_t batchSize = 1000;
orc::RowReaderOptions rowReaderOptions;
bool success = parseOptions(&argc, &argv, &batchSize, &rowReaderOptions);
if (argc < 1 || !success) {
std::cerr << "Usage: orc-memory [options] <filename>...\n";
printOptions(std::cerr);
std::cerr << "Estimate the memory footprint for reading ORC files\n";
return 1;
}

const std::string COLUMNS_PREFIX = "--columns=";
const std::string BATCH_PREFIX = "--batch=";
char* filename = ORC_NULLPTR;

// Default parameters
std::list<uint64_t> cols;
uint32_t batchSize = 1000;

// Read command-line options
char *param, *value;
for (int i = 1; i < argc; i++) {
if ( (param = std::strstr(argv[i], COLUMNS_PREFIX.c_str())) ) {
value = std::strtok(param+COLUMNS_PREFIX.length(), "," );
while (value) {
cols.push_back(static_cast<uint64_t>(std::atoi(value)));
value = std::strtok(ORC_NULLPTR, "," );
}
} else if ( (param=strstr(argv[i], BATCH_PREFIX.c_str())) ) {
batchSize =
static_cast<uint32_t>(std::atoi(param+BATCH_PREFIX.length()));
} else {
filename = argv[i];
for (int i = 0; i < argc; ++i) {
try {
processFile(argv[i], rowReaderOptions, batchSize);
} catch (std::exception& ex) {
std::cerr << "Caught exception: " << ex.what() << "\n";
return 1;
}
}

if (filename == ORC_NULLPTR) {
std::cout << "Error: Filename not provided.\n";
return 1;
}

try {
processFile(filename, cols, batchSize);
return 0;
} catch (std::exception& ex) {
std::cerr << "Caught exception: " << ex.what() << "\n";
return 1;
}
return 0;
}
75 changes: 13 additions & 62 deletions tools/src/FileScan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,9 @@
* limitations under the License.
*/

#include "orc/ColumnPrinter.hh"
#include "ToolsHelper.hh"

#include "orc/Exceptions.hh"

#include <getopt.h>
#include <string>
#include <memory>
#include <iostream>
#include <string>

void scanFile(std::ostream & out, const char* filename, uint64_t batchSize,
const orc::RowReaderOptions& rowReaderOpts) {
Expand All @@ -46,64 +40,21 @@ void scanFile(std::ostream & out, const char* filename, uint64_t batchSize,
}

int main(int argc, char* argv[]) {
static struct option longOptions[] = {
{"help", no_argument, ORC_NULLPTR, 'h'},
{"batch", required_argument, ORC_NULLPTR, 'b'},
{"columns", required_argument, ORC_NULLPTR, 'c'},
{ORC_NULLPTR, 0, ORC_NULLPTR, 0}
};
bool helpFlag = false;
uint64_t batchSize = 1024;
std::list<uint64_t> cols;
orc::RowReaderOptions rowReaderOptions;
int opt;
char *tail;
do {
opt = getopt_long(argc, argv, "hb:c:", longOptions, ORC_NULLPTR);
switch (opt) {
case '?':
case 'h':
helpFlag = true;
opt = -1;
break;
case 'b':
batchSize = strtoul(optarg, &tail, 10);
if (*tail != '\0') {
fprintf(stderr, "The --batch parameter requires an integer option.\n");
return 1;
}
break;
case 'c': {
char *col = std::strtok(optarg, ",");
while (col) {
cols.push_back(static_cast<uint64_t>(std::atoi(col)));
col = std::strtok(ORC_NULLPTR, ",");
}
if (!cols.empty()) {
rowReaderOptions.include(cols);
}
break;
}
default: break;
}
} while (opt != -1);
argc -= optind;
argv += optind;

if (argc < 1 || helpFlag) {
std::cerr << "Usage: orc-scan [-h] [--help]\n"
<< " [-c 1,2,...] [--columns=1,2,...]\n"
<< " [-b<size>] [--batch=<size>] <filename>\n";
bool success = parseOptions(&argc, &argv, &batchSize, &rowReaderOptions);
if (argc < 1 || !success) {
std::cerr << "Usage: orc-scan [options] <filename>...\n";
printOptions(std::cerr);
std::cerr << "Scans and displays the row count of the ORC files.\n";
return 1;
} else {
for(int i=0; i < argc; ++i) {
try {
scanFile(std::cout, argv[i], batchSize, rowReaderOptions);
} catch (std::exception& ex) {
std::cerr << "Caught exception in " << argv[i]
<< ": " << ex.what() << "\n";
return 1;
}
}
for (int i = 0; i < argc; ++i) {
try {
scanFile(std::cout, argv[i], batchSize, rowReaderOptions);
} catch (std::exception& ex) {
std::cerr << "Caught exception in " << argv[i] << ": " << ex.what() << "\n";
return 1;
}
}
return 0;
Expand Down
Loading