19 #include <cudf/io/detail/orc.hpp>
24 #include <rmm/mr/device/per_device_resource.hpp>
29 #include <unordered_map>
// Data members below appear to belong to the ORC reader options (they match its accessors).
// Names of the columns to read, if set.
56 std::optional<std::vector<std::string>> _columns;
// Stripes to read for each input source (outer vector indexed per source).
59 std::vector<std::vector<size_type>> _stripes;
// Number of rows to skip from the start; defaults to 0.
61 uint64_t _skip_rows = 0;
// Number of rows to read, if set.
63 std::optional<size_type> _num_rows;
// Whether to use the row index to speed up reading; enabled by default.
66 bool _use_index =
true;
// Whether to use numpy-compatible dtypes; enabled by default.
69 bool _use_np_dtypes =
true;
// Fully qualified names of the columns that should be read as 128-bit Decimal.
74 std::vector<std::string> _decimal128_columns;
/**
 * @brief Returns names of the columns to read, if set.
 *
 * @return Const reference to the stored optional list of column names
 */
113 [[nodiscard]]
auto const&
get_columns()
const {
return _columns; }
/**
 * @brief Returns vector of vectors, stripes to read for each input source.
 *
 * @return Const reference to the stored per-source stripe lists
 */
120 [[nodiscard]]
auto const&
get_stripes()
const {
return _stripes; }
/**
 * @brief Returns the number of rows to read.
 *
 * @return Const reference to the stored optional row count; the optional holds no value
 * when no row count has been stored
 */
135 std::optional<size_type>
const&
get_num_rows()
const {
return _num_rows; }
/**
 * @brief Sets the names of the columns to read.
 *
 * @param col_names Vector of column names; taken by value and moved into the stored option
 */
172 void set_columns(std::vector<std::string> col_names) { _columns = std::move(col_names); }
186 CUDF_EXPECTS(stripes.empty() or (_skip_rows == 0),
"Can't set stripes along with skip_rows");
187 CUDF_EXPECTS(stripes.empty() or not _num_rows.has_value(),
188 "Can't set stripes along with num_rows");
189 _stripes = std::move(stripes);
202 CUDF_EXPECTS(rows == 0 or _stripes.empty(),
"Can't set both skip_rows along with stripes");
203 CUDF_EXPECTS(rows <= std::numeric_limits<int64_t>::max(),
"skip_rows is too large");
217 CUDF_EXPECTS(nrows >= 0,
"num_rows cannot be negative");
218 CUDF_EXPECTS(_stripes.empty(),
"Can't set both num_rows and stripes");
250 _decimal128_columns = std::move(val);
283 options._columns = std::move(col_names);
331 options._use_index = use;
343 options._use_np_dtypes = use;
355 options._timestamp_type = type;
367 options._decimal128_columns = std::move(val);
406 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
// Data members below appear to belong to the ORC writer options (they match its accessors).
// Associated table metadata, if set.
451 std::optional<table_input_metadata> _metadata;
// Key-Value footer metadata.
453 std::map<std::string, std::string> _user_data;
// Shared pointer to the user-provided output compression statistics.
455 std::shared_ptr<writer_compression_statistics> _compression_stats;
// Whether string dictionaries should be sorted; enabled by default.
457 bool _enable_dictionary_sort =
true;
468 : _sink(sink), _table(
table)
543 return unaligned_stride - unaligned_stride % 8;
577 return _compression_stats;
617 CUDF_EXPECTS(size_bytes >= 64 << 10,
"64KB is the minimum stripe size");
618 _stripe_size_bytes = size_bytes;
633 CUDF_EXPECTS(size_rows >= 512,
"Maximum stripe size cannot be smaller than 512");
634 _stripe_size_rows = size_rows;
648 CUDF_EXPECTS(stride >= 512,
"Row index stride cannot be smaller than 512");
649 _row_index_stride = stride;
673 _user_data = std::move(metadata);
683 _compression_stats = std::move(comp_stats);
726 options._compression = comp;
743 options._stats_freq = val;
791 options._table = tbl;
803 options._metadata = std::move(meta);
815 options._user_data = std::move(
metadata);
826 std::shared_ptr<writer_compression_statistics>
const& comp_stats)
828 options._compression_stats = comp_stats;
840 options._enable_dictionary_sort = val;
// Data members below appear to belong to the chunked ORC writer options (they match its accessors).
// Associated table metadata, if set.
897 std::optional<table_input_metadata> _metadata;
// Key-Value footer metadata.
899 std::map<std::string, std::string> _user_data;
// Shared pointer to the user-provided output compression statistics.
901 std::shared_ptr<writer_compression_statistics> _compression_stats;
// Whether string dictionaries should be sorted; enabled by default.
903 bool _enable_dictionary_sort =
true;
974 return unaligned_stride - unaligned_stride % 8;
1001 return _compression_stats;
1041 CUDF_EXPECTS(size_bytes >= 64 << 10,
"64KB is the minimum stripe size");
1042 _stripe_size_bytes = size_bytes;
1057 CUDF_EXPECTS(size_rows >= 512,
"maximum stripe size cannot be smaller than 512");
1058 _stripe_size_rows = size_rows;
1072 CUDF_EXPECTS(stride >= 512,
"Row index stride cannot be smaller than 512");
1073 _row_index_stride = stride;
1100 _compression_stats = std::move(comp_stats);
1140 options._compression = comp;
1157 options._stats_freq = val;
1205 options._metadata = std::move(meta);
1216 std::map<std::string, std::string>
metadata)
1218 options._user_data = std::move(
metadata);
1229 std::shared_ptr<writer_compression_statistics>
const& comp_stats)
1231 options._compression_stats = comp_stats;
1243 options._enable_dictionary_sort = val;
Indicator for the logical data type of an element in a column.
Builds settings to use for write_orc_chunked().
chunked_orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
chunked_orc_writer_options && build()
Moves the chunked_orc_writer_options member once it's built.
chunked_orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
chunked_orc_writer_options_builder()=default
Default constructor.
chunked_orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
chunked_orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
chunked_orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
chunked_orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
chunked_orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
chunked_orc_writer_options_builder(sink_info const &sink)
Constructor from sink.
chunked_orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
chunked_orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Settings to use for write_orc_chunked().
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
chunked_orc_writer_options()=default
Default constructor.
void metadata(table_input_metadata meta)
Sets associated metadata.
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
sink_info const & get_sink() const
Returns sink info.
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
auto get_row_index_stride() const
Returns the row index stride.
void set_row_index_stride(size_type stride)
Sets the row index stride.
statistics_freq get_statistics_freq() const
Returns granularity of statistics collection.
void set_compression(compression_type comp)
Sets compression type.
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
auto const & get_metadata() const
Returns associated metadata.
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
compression_type get_compression() const
Returns compression type.
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
static chunked_orc_writer_options_builder builder(sink_info const &sink)
Create builder to create chunked_orc_writer_options.
Chunked orc writer class writes an ORC file in a chunked/stream form.
orc_chunked_writer(chunked_orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructor with chunked writer options.
std::unique_ptr< orc::detail::writer > writer
Unique pointer to impl writer class.
orc_chunked_writer & write(table_view const &table)
Writes table to output.
void close()
Finishes the chunked/streamed write process.
orc_chunked_writer()=default
Default constructor; this should never be used. It is added only to satisfy Cython.
Builds settings to use for read_orc().
orc_reader_options_builder & use_index(bool use)
Enable/Disable use of row index to speed-up reading.
orc_reader_options_builder & decimal128_columns(std::vector< std::string > val)
Columns that should be read as 128-bit Decimal.
orc_reader_options_builder & use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
orc_reader_options_builder & skip_rows(uint64_t rows)
Sets number of rows to skip from the start.
orc_reader_options_builder()=default
Default constructor.
orc_reader_options_builder(source_info src)
Constructor from source info.
orc_reader_options_builder & num_rows(size_type nrows)
Sets the number of rows to read.
orc_reader_options_builder & stripes(std::vector< std::vector< size_type >> stripes)
Sets list of individual stripes to read per source.
orc_reader_options_builder & columns(std::vector< std::string > col_names)
Sets the names of the columns to read.
orc_reader_options && build()
Moves the orc_reader_options member once it's built.
orc_reader_options_builder & timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Settings to use for read_orc().
std::optional< size_type > const & get_num_rows() const
Returns the number of rows to read.
orc_reader_options()=default
Default constructor.
void enable_use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
auto const & get_stripes() const
Returns vector of vectors, stripes to read for each input source.
void set_decimal128_columns(std::vector< std::string > val)
Set columns that should be read as 128-bit Decimal.
void enable_use_index(bool use)
Enable/Disable use of row index to speed-up reading.
void set_columns(std::vector< std::string > col_names)
Sets the names of the columns to read.
uint64_t get_skip_rows() const
Returns number of rows to skip from the start.
void set_stripes(std::vector< std::vector< size_type >> stripes)
Sets list of stripes to read for each input source.
data_type get_timestamp_type() const
Returns timestamp type to which timestamp column will be cast.
void set_num_rows(size_type nrows)
Sets the number of rows to read.
auto const & get_columns() const
Returns names of the columns to read, if set.
void set_skip_rows(uint64_t rows)
Sets number of rows to skip from the start.
static orc_reader_options_builder builder(source_info src)
Creates orc_reader_options_builder which will build orc_reader_options.
source_info const & get_source() const
Returns source info.
bool is_enabled_use_np_dtypes() const
Whether to use numpy-compatible dtypes.
bool is_enabled_use_index() const
Whether to use row index to speed-up reading.
std::vector< std::string > const & get_decimal128_columns() const
Returns fully qualified names of columns that should be read as 128-bit Decimal.
void set_timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Builds settings to use for write_orc().
orc_writer_options_builder & table(table_view tbl)
Sets table to be written to output.
orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of column statistics to be written.
orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
orc_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
orc_writer_options && build()
Moves the orc_writer_options member once it's built.
orc_writer_options_builder()=default
Default constructor.
orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Settings to use for write_orc().
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
auto const & get_metadata() const
Returns associated metadata.
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
bool is_enabled_statistics() const
Whether writing column statistics is enabled/disabled.
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
table_view get_table() const
Returns table to be written to output.
void set_metadata(table_input_metadata meta)
Sets associated metadata.
statistics_freq get_statistics_freq() const
Returns frequency of statistics collection.
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
auto get_row_index_stride() const
Returns the row index stride.
void set_table(table_view tbl)
Sets table to be written to output.
orc_writer_options()=default
Default constructor.
void set_compression(compression_type comp)
Sets compression type.
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
void set_row_index_stride(size_type stride)
Sets the row index stride.
compression_type get_compression() const
Returns compression type.
static orc_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create orc_writer_options.
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
sink_info const & get_sink() const
Returns sink info.
A set of cudf::column_view's of the same size.
A set of cudf::column's of the same size.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
constexpr size_type default_stripe_size_rows
Default ORC stripe size, in rows: 1M rows.
constexpr size_type default_row_index_stride
10K rows default orc row index stride
table_with_metadata read_orc(orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Reads an ORC dataset into a set of columns.
constexpr size_t default_stripe_size_bytes
64MB default orc stripe size
compression_type
Compression algorithms.
statistics_freq
Column statistics granularity type for parquet/orc writers.
@ AUTO
Automatically detect or select compression format.
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
@ STATISTICS_NONE
No column statistics.
@ STATISTICS_PAGE
Per-page column statistics.
void write_orc(orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to ORC format.
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
int32_t size_type
Row index type for columns and tables.
@ EMPTY
Always null with no underlying data.
cuDF-IO API type definitions
Destination information for write interfaces.
Source information for read interfaces.
Class definitions for (mutable)_table_view
Type declarations for libcudf.