libcudf  24.04.00
orc.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/detail/orc.hpp>
20 #include <cudf/io/types.hpp>
22 #include <cudf/types.hpp>
23 
24 #include <rmm/mr/device/per_device_resource.hpp>
25 
26 #include <memory>
27 #include <optional>
28 #include <string>
29 #include <unordered_map>
30 #include <vector>
31 
32 namespace cudf {
33 namespace io {
/// Default ORC stripe size in bytes: 64MB.
constexpr size_t default_stripe_size_bytes{64 * 1024 * 1024};
41 constexpr size_type default_stripe_size_rows = 1000000;
43 
48 
53  source_info _source;
54 
55  // Names of columns to read; `nullopt` means all columns
56  std::optional<std::vector<std::string>> _columns;
57 
58  // List of individual stripes to read (ignored if empty)
59  std::vector<std::vector<size_type>> _stripes;
60  // Rows to skip from the start; ORC stores the number of rows as uint64_t
61  uint64_t _skip_rows = 0;
62  // Rows to read; `nullopt` is all
63  std::optional<size_type> _num_rows;
64 
65  // Whether to use row index to speed-up reading
66  bool _use_index = true;
67 
68  // Whether to use numpy-compatible dtypes
69  bool _use_np_dtypes = true;
70  // Cast timestamp columns to a specific type
71  data_type _timestamp_type{type_id::EMPTY};
72 
73  // Columns that should be read as Decimal128
74  std::vector<std::string> _decimal128_columns;
75 
77 
83  explicit orc_reader_options(source_info src) : _source{std::move(src)} {}
84 
85  public:
91  orc_reader_options() = default;
92 
100 
106  [[nodiscard]] source_info const& get_source() const { return _source; }
107 
113  [[nodiscard]] auto const& get_columns() const { return _columns; }
114 
120  [[nodiscard]] auto const& get_stripes() const { return _stripes; }
121 
127  uint64_t get_skip_rows() const { return _skip_rows; }
128 
135  std::optional<size_type> const& get_num_rows() const { return _num_rows; }
136 
142  bool is_enabled_use_index() const { return _use_index; }
143 
149  bool is_enabled_use_np_dtypes() const { return _use_np_dtypes; }
150 
156  data_type get_timestamp_type() const { return _timestamp_type; }
157 
163  std::vector<std::string> const& get_decimal128_columns() const { return _decimal128_columns; }
164 
165  // Setters
166 
172  void set_columns(std::vector<std::string> col_names) { _columns = std::move(col_names); }
173 
184  void set_stripes(std::vector<std::vector<size_type>> stripes)
185  {
186  CUDF_EXPECTS(stripes.empty() or (_skip_rows == 0), "Can't set stripes along with skip_rows");
187  CUDF_EXPECTS(stripes.empty() or not _num_rows.has_value(),
188  "Can't set stripes along with num_rows");
189  _stripes = std::move(stripes);
190  }
191 
200  void set_skip_rows(uint64_t rows)
201  {
202  CUDF_EXPECTS(rows == 0 or _stripes.empty(), "Can't set both skip_rows along with stripes");
203  CUDF_EXPECTS(rows <= std::numeric_limits<int64_t>::max(), "skip_rows is too large");
204  _skip_rows = rows;
205  }
206 
216  {
217  CUDF_EXPECTS(nrows >= 0, "num_rows cannot be negative");
218  CUDF_EXPECTS(_stripes.empty(), "Can't set both num_rows and stripes");
219  _num_rows = nrows;
220  }
221 
227  void enable_use_index(bool use) { _use_index = use; }
228 
234  void enable_use_np_dtypes(bool use) { _use_np_dtypes = use; }
235 
241  void set_timestamp_type(data_type type) { _timestamp_type = type; }
242 
248  void set_decimal128_columns(std::vector<std::string> val)
249  {
250  _decimal128_columns = std::move(val);
251  }
252 };
253 
258  orc_reader_options options;
259 
260  public:
266  explicit orc_reader_options_builder() = default;
267 
273  explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {};
274 
281  orc_reader_options_builder& columns(std::vector<std::string> col_names)
282  {
283  options._columns = std::move(col_names);
284  return *this;
285  }
286 
293  orc_reader_options_builder& stripes(std::vector<std::vector<size_type>> stripes)
294  {
295  options.set_stripes(std::move(stripes));
296  return *this;
297  }
298 
306  {
307  options.set_skip_rows(rows);
308  return *this;
309  }
310 
318  {
319  options.set_num_rows(nrows);
320  return *this;
321  }
322 
330  {
331  options._use_index = use;
332  return *this;
333  }
334 
342  {
343  options._use_np_dtypes = use;
344  return *this;
345  }
346 
354  {
355  options._timestamp_type = type;
356  return *this;
357  }
358 
365  orc_reader_options_builder& decimal128_columns(std::vector<std::string> val)
366  {
367  options._decimal128_columns = std::move(val);
368  return *this;
369  }
370 
374  operator orc_reader_options&&() { return std::move(options); }
375 
383  orc_reader_options&& build() { return std::move(options); }
384 };
385 
404  orc_reader_options const& options,
405  rmm::cuda_stream_view stream = cudf::get_default_stream(),
406  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
407  // end of group
419 
429 static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP;
430 static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE;
431 
436  // Specify the sink to use for writer output
437  sink_info _sink;
438  // Specify the compression format to use
440  // Specify frequency of statistics collection
441  statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
442  // Maximum size of each stripe (unless smaller than a single row group)
443  size_t _stripe_size_bytes = default_stripe_size_bytes;
444  // Maximum number of rows in stripe (unless smaller than a single row group)
445  size_type _stripe_size_rows = default_stripe_size_rows;
446  // Row index stride (maximum number of rows in each row group)
447  size_type _row_index_stride = default_row_index_stride;
448  // Set of columns to output
449  table_view _table;
450  // Optional associated metadata
451  std::optional<table_input_metadata> _metadata;
452  // Optional footer key_value_metadata
453  std::map<std::string, std::string> _user_data;
454  // Optional compression statistics
455  std::shared_ptr<writer_compression_statistics> _compression_stats;
456  // Specify whether string dictionaries should be alphabetically sorted
457  bool _enable_dictionary_sort = true;
458 
460 
467  explicit orc_writer_options(sink_info const& sink, table_view const& table)
468  : _sink(sink), _table(table)
469  {
470  }
471 
472  public:
478  explicit orc_writer_options() = default;
479 
489 
495  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
496 
502  [[nodiscard]] compression_type get_compression() const { return _compression; }
503 
509  [[nodiscard]] bool is_enabled_statistics() const
510  {
511  return _stats_freq != statistics_freq::STATISTICS_NONE;
512  }
513 
519  [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
520 
526  [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
527 
533  [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; }
534 
540  auto get_row_index_stride() const
541  {
542  auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
543  return unaligned_stride - unaligned_stride % 8;
544  }
545 
551  [[nodiscard]] table_view get_table() const { return _table; }
552 
558  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
559 
565  [[nodiscard]] std::map<std::string, std::string> const& get_key_value_metadata() const
566  {
567  return _user_data;
568  }
569 
575  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
576  {
577  return _compression_stats;
578  }
579 
585  [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; }
586 
587  // Setters
588 
594  void set_compression(compression_type comp) { _compression = comp; }
595 
606  void enable_statistics(statistics_freq val) { _stats_freq = val; }
607 
615  void set_stripe_size_bytes(size_t size_bytes)
616  {
617  CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
618  _stripe_size_bytes = size_bytes;
619  }
620 
632  {
633  CUDF_EXPECTS(size_rows >= 512, "Maximum stripe size cannot be smaller than 512");
634  _stripe_size_rows = size_rows;
635  }
636 
647  {
648  CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512");
649  _row_index_stride = stride;
650  }
651 
657  void set_table(table_view tbl) { _table = tbl; }
658 
664  void set_metadata(table_input_metadata meta) { _metadata = std::move(meta); }
665 
671  void set_key_value_metadata(std::map<std::string, std::string> metadata)
672  {
673  _user_data = std::move(metadata);
674  }
675 
681  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
682  {
683  _compression_stats = std::move(comp_stats);
684  }
685 
691  void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; }
692 };
693 
698  orc_writer_options options;
699 
700  public:
707 
714  orc_writer_options_builder(sink_info const& sink, table_view const& table) : options{sink, table}
715  {
716  }
717 
725  {
726  options._compression = comp;
727  return *this;
728  }
729 
742  {
743  options._stats_freq = val;
744  return *this;
745  }
746 
754  {
755  options.set_stripe_size_bytes(val);
756  return *this;
757  }
758 
766  {
767  options.set_stripe_size_rows(val);
768  return *this;
769  }
770 
778  {
779  options.set_row_index_stride(val);
780  return *this;
781  }
782 
790  {
791  options._table = tbl;
792  return *this;
793  }
794 
802  {
803  options._metadata = std::move(meta);
804  return *this;
805  }
806 
813  orc_writer_options_builder& key_value_metadata(std::map<std::string, std::string> metadata)
814  {
815  options._user_data = std::move(metadata);
816  return *this;
817  }
818 
826  std::shared_ptr<writer_compression_statistics> const& comp_stats)
827  {
828  options._compression_stats = comp_stats;
829  return *this;
830  }
831 
839  {
840  options._enable_dictionary_sort = val;
841  return *this;
842  }
843 
847  operator orc_writer_options&&() { return std::move(options); }
848 
856  orc_writer_options&& build() { return std::move(options); }
857 };
858 
/**
 * @brief Writes a set of columns to ORC format.
 *
 * @param options Settings to use for the write
 * @param stream  CUDA stream view; defaults to `cudf::get_default_stream()`
 */
872 void write_orc(orc_writer_options const& options,
873  rmm::cuda_stream_view stream = cudf::get_default_stream());
874 
879 
884  // Specify the sink to use for writer output
885  sink_info _sink;
886  // Specify the compression format to use
888  // Specify granularity of statistics collection
889  statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
890  // Maximum size of each stripe (unless smaller than a single row group)
891  size_t _stripe_size_bytes = default_stripe_size_bytes;
892  // Maximum number of rows in stripe (unless smaller than a single row group)
893  size_type _stripe_size_rows = default_stripe_size_rows;
894  // Row index stride (maximum number of rows in each row group)
895  size_type _row_index_stride = default_row_index_stride;
896  // Optional associated metadata
897  std::optional<table_input_metadata> _metadata;
898  // Optional footer key_value_metadata
899  std::map<std::string, std::string> _user_data;
900  // Optional compression statistics
901  std::shared_ptr<writer_compression_statistics> _compression_stats;
902  // Specify whether string dictionaries should be alphabetically sorted
903  bool _enable_dictionary_sort = true;
904 
906 
912  chunked_orc_writer_options(sink_info const& sink) : _sink(sink) {}
913 
914  public:
920  explicit chunked_orc_writer_options() = default;
921 
930 
936  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
937 
943  [[nodiscard]] compression_type get_compression() const { return _compression; }
944 
950  [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
951 
957  [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
958 
964  [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; }
965 
971  auto get_row_index_stride() const
972  {
973  auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
974  return unaligned_stride - unaligned_stride % 8;
975  }
976 
982  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
983 
989  [[nodiscard]] std::map<std::string, std::string> const& get_key_value_metadata() const
990  {
991  return _user_data;
992  }
993 
999  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
1000  {
1001  return _compression_stats;
1002  }
1003 
1009  [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; }
1010 
1011  // Setters
1012 
1018  void set_compression(compression_type comp) { _compression = comp; }
1019 
1030  void enable_statistics(statistics_freq val) { _stats_freq = val; }
1031 
1039  void set_stripe_size_bytes(size_t size_bytes)
1040  {
1041  CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
1042  _stripe_size_bytes = size_bytes;
1043  }
1044 
1056  {
1057  CUDF_EXPECTS(size_rows >= 512, "maximum stripe size cannot be smaller than 512");
1058  _stripe_size_rows = size_rows;
1059  }
1060 
1071  {
1072  CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512");
1073  _row_index_stride = stride;
1074  }
1075 
1081  void metadata(table_input_metadata meta) { _metadata = std::move(meta); }
1082 
1088  void set_key_value_metadata(std::map<std::string, std::string> metadata)
1089  {
1090  _user_data = std::move(metadata);
1091  }
1092 
1098  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
1099  {
1100  _compression_stats = std::move(comp_stats);
1101  }
1102 
1108  void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; }
1109 };
1110 
1116 
1117  public:
1124 
1130  explicit chunked_orc_writer_options_builder(sink_info const& sink) : options{sink} {}
1131 
1139  {
1140  options._compression = comp;
1141  return *this;
1142  }
1143 
1156  {
1157  options._stats_freq = val;
1158  return *this;
1159  }
1160 
1168  {
1169  options.set_stripe_size_bytes(val);
1170  return *this;
1171  }
1172 
1180  {
1181  options.set_stripe_size_rows(val);
1182  return *this;
1183  }
1184 
1192  {
1193  options.set_row_index_stride(val);
1194  return *this;
1195  }
1196 
1204  {
1205  options._metadata = std::move(meta);
1206  return *this;
1207  }
1208 
1216  std::map<std::string, std::string> metadata)
1217  {
1218  options._user_data = std::move(metadata);
1219  return *this;
1220  }
1221 
1229  std::shared_ptr<writer_compression_statistics> const& comp_stats)
1230  {
1231  options._compression_stats = comp_stats;
1232  return *this;
1233  }
1234 
1242  {
1243  options._enable_dictionary_sort = val;
1244  return *this;
1245  }
1246 
1250  operator chunked_orc_writer_options&&() { return std::move(options); }
1251 
1259  chunked_orc_writer_options&& build() { return std::move(options); }
1260 };
1261 
1284  public:
/**
 * @brief Default constructor, this should never be used.
 *
 * This is added just to satisfy cython.
 */
1289  orc_chunked_writer() = default;
1290 
1298  rmm::cuda_stream_view stream = cudf::get_default_stream());
1299 
1307 
/**
 * @brief Finishes the chunked/streamed write process.
 */
1311  void close();
1312 
/// Unique pointer to impl writer class.
1314  std::unique_ptr<orc::detail::writer> writer;
1315 };
1316  // end of group
1318 } // namespace io
1319 } // namespace cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:241
Builds settings to use for write_orc_chunked().
Definition: orc.hpp:1114
chunked_orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1241
chunked_orc_writer_options && build()
Moves the chunked_orc_writer_options member out once it's built.
Definition: orc.hpp:1259
chunked_orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:1167
chunked_orc_writer_options_builder()=default
Default constructor.
chunked_orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
Definition: orc.hpp:1179
chunked_orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1228
chunked_orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1215
chunked_orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:1138
chunked_orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:1203
chunked_orc_writer_options_builder(sink_info const &sink)
Constructor from sink.
Definition: orc.hpp:1130
chunked_orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:1155
chunked_orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Definition: orc.hpp:1191
Settings to use for write_orc_chunked().
Definition: orc.hpp:883
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:1039
chunked_orc_writer_options()=default
Default constructor.
void metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:1081
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1088
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1098
sink_info const & get_sink() const
Returns sink info.
Definition: orc.hpp:936
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
Definition: orc.hpp:964
auto get_row_index_stride() const
Returns the row index stride.
Definition: orc.hpp:971
void set_row_index_stride(size_type stride)
Sets the row index stride.
Definition: orc.hpp:1070
statistics_freq get_statistics_freq() const
Returns granularity of statistics collection.
Definition: orc.hpp:950
void set_compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:1018
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: orc.hpp:989
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1108
auto const & get_metadata() const
Returns associated metadata.
Definition: orc.hpp:982
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
Definition: orc.hpp:1009
compression_type get_compression() const
Returns compression type.
Definition: orc.hpp:943
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
Definition: orc.hpp:1055
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: orc.hpp:999
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
Definition: orc.hpp:957
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:1030
static chunked_orc_writer_options_builder builder(sink_info const &sink)
Create builder to create chunked_orc_writer_options.
Chunked orc writer class writes an ORC file in a chunked/stream form.
Definition: orc.hpp:1283
orc_chunked_writer(chunked_orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructor with chunked writer options.
std::unique_ptr< orc::detail::writer > writer
Unique pointer to impl writer class.
Definition: orc.hpp:1314
orc_chunked_writer & write(table_view const &table)
Writes table to output.
void close()
Finishes the chunked/streamed write process.
orc_chunked_writer()=default
Default constructor, this should never be used. This is added just to satisfy cython.
Builds settings to use for read_orc().
Definition: orc.hpp:257
orc_reader_options_builder & use_index(bool use)
Enable/Disable use of row index to speed-up reading.
Definition: orc.hpp:329
orc_reader_options_builder & decimal128_columns(std::vector< std::string > val)
Columns that should be read as 128-bit Decimal.
Definition: orc.hpp:365
orc_reader_options_builder & use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
Definition: orc.hpp:341
orc_reader_options_builder & skip_rows(uint64_t rows)
Sets number of rows to skip from the start.
Definition: orc.hpp:305
orc_reader_options_builder()=default
Default constructor.
orc_reader_options_builder(source_info src)
Constructor from source info.
Definition: orc.hpp:273
orc_reader_options_builder & num_rows(size_type nrows)
Sets number of rows to read.
Definition: orc.hpp:317
orc_reader_options_builder & stripes(std::vector< std::vector< size_type >> stripes)
Sets list of individual stripes to read per source.
Definition: orc.hpp:293
orc_reader_options_builder & columns(std::vector< std::string > col_names)
Sets names of the columns to read.
Definition: orc.hpp:281
orc_reader_options && build()
Moves the orc_reader_options member out once it's built.
Definition: orc.hpp:383
orc_reader_options_builder & timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Definition: orc.hpp:353
Settings to use for read_orc().
Definition: orc.hpp:52
std::optional< size_type > const & get_num_rows() const
Returns number of rows to read.
Definition: orc.hpp:135
orc_reader_options()=default
Default constructor.
void enable_use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
Definition: orc.hpp:234
auto const & get_stripes() const
Returns vector of vectors, stripes to read for each input source.
Definition: orc.hpp:120
void set_decimal128_columns(std::vector< std::string > val)
Set columns that should be read as 128-bit Decimal.
Definition: orc.hpp:248
void enable_use_index(bool use)
Enable/Disable use of row index to speed-up reading.
Definition: orc.hpp:227
void set_columns(std::vector< std::string > col_names)
Sets names of the columns to read.
Definition: orc.hpp:172
uint64_t get_skip_rows() const
Returns number of rows to skip from the start.
Definition: orc.hpp:127
void set_stripes(std::vector< std::vector< size_type >> stripes)
Sets list of stripes to read for each input source.
Definition: orc.hpp:184
data_type get_timestamp_type() const
Returns timestamp type to which timestamp column will be cast.
Definition: orc.hpp:156
void set_num_rows(size_type nrows)
Sets number of rows to read.
Definition: orc.hpp:215
auto const & get_columns() const
Returns names of the columns to read, if set.
Definition: orc.hpp:113
void set_skip_rows(uint64_t rows)
Sets number of rows to skip from the start.
Definition: orc.hpp:200
static orc_reader_options_builder builder(source_info src)
Creates orc_reader_options_builder which will build orc_reader_options.
source_info const & get_source() const
Returns source info.
Definition: orc.hpp:106
bool is_enabled_use_np_dtypes() const
Whether to use numpy-compatible dtypes.
Definition: orc.hpp:149
bool is_enabled_use_index() const
Whether to use row index to speed-up reading.
Definition: orc.hpp:142
std::vector< std::string > const & get_decimal128_columns() const
Returns fully qualified names of columns that should be read as 128-bit Decimal.
Definition: orc.hpp:163
void set_timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Definition: orc.hpp:241
Builds settings to use for write_orc().
Definition: orc.hpp:697
orc_writer_options_builder & table(table_view tbl)
Sets table to be written to output.
Definition: orc.hpp:789
orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Definition: orc.hpp:777
orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of column statistics to be written.
Definition: orc.hpp:741
orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:801
orc_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: orc.hpp:714
orc_writer_options && build()
move orc_writer_options member once it's built.
Definition: orc.hpp:856
orc_writer_options_builder()=default
Default constructor.
orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:813
orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:825
orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
Definition: orc.hpp:765
orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:838
orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:724
orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:753
Settings to use for write_orc().
Definition: orc.hpp:435
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:606
auto const & get_metadata() const
Returns associated metadata.
Definition: orc.hpp:558
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: orc.hpp:565
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: orc.hpp:575
bool is_enabled_statistics() const
Whether writing column statistics is enabled/disabled.
Definition: orc.hpp:509
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
Definition: orc.hpp:526
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
Definition: orc.hpp:631
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:671
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
Definition: orc.hpp:533
table_view get_table() const
Returns table to be written to output.
Definition: orc.hpp:551
void set_metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:664
statistics_freq get_statistics_freq() const
Returns frequency of statistics collection.
Definition: orc.hpp:519
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:681
auto get_row_index_stride() const
Returns the row index stride.
Definition: orc.hpp:540
void set_table(table_view tbl)
Sets table to be written to output.
Definition: orc.hpp:657
orc_writer_options()=default
Default constructor.
void set_compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:594
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:691
void set_row_index_stride(size_type stride)
Sets the row index stride.
Definition: orc.hpp:646
compression_type get_compression() const
Returns compression type.
Definition: orc.hpp:502
static orc_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create orc_writer_options.
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
Definition: orc.hpp:585
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:615
sink_info const & get_sink() const
Returns sink info.
Definition: orc.hpp:495
Metadata for a table.
Definition: io/types.hpp:858
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:187
A set of cudf::column's of the same size.
Definition: table.hpp:40
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
constexpr size_type default_stripe_size_rows
1M rows default orc stripe rows
Definition: orc.hpp:41
constexpr size_type default_row_index_stride
10K rows default orc row index stride
Definition: orc.hpp:42
table_with_metadata read_orc(orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Reads an ORC dataset into a set of columns.
constexpr size_t default_stripe_size_bytes
64MB default orc stripe size
Definition: orc.hpp:40
compression_type
Compression algorithms.
Definition: io/types.hpp:56
statistics_freq
Column statistics granularity type for parquet/orc writers.
Definition: io/types.hpp:95
@ AUTO
Automatically detect or select compression format.
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
Definition: io/types.hpp:97
@ STATISTICS_NONE
No column statistics.
Definition: io/types.hpp:96
@ STATISTICS_PAGE
Per-page column statistics.
Definition: io/types.hpp:98
void write_orc(orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to ORC format.
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:177
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:93
@ EMPTY
Always null with no underlying data.
cuDF-IO API type definitions
cuDF interfaces
Definition: aggregation.hpp:34
Destination information for write interfaces.
Definition: io/types.hpp:489
Source information for read interfaces.
Definition: io/types.hpp:314
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:269
Class definitions for (mutable)_table_view
Type declarations for libcudf.