libcudf/legacy/subword__tokenize_8hpp_source.html

 /*

  * Copyright (c) 2020-2023, NVIDIA CORPORATION.

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

 #pragma once


 #include <cudf/column/column.hpp>

 #include <cudf/column/column_view.hpp>

 #include <cudf/strings/strings_column_view.hpp>


 namespace nvtext {


 /**
  * @brief The vocabulary data for use with the subword_tokenize function.
  *
  * Bundles the special token ids, the outer-hash parameters and the owning
  * column handles that make up a loaded hashed vocabulary. Every scalar member
  * is value-initialized; every column handle starts out null.
  */
 struct hashed_vocabulary {
   uint16_t first_token_id{};      ///< The first token id in the vocabulary
   uint16_t separator_token_id{};  ///< The separator token id in the vocabulary
   uint16_t unknown_token_id{};    ///< The unknown token id in the vocabulary
   uint32_t outer_hash_a{};        ///< The `a` parameter for the outer hash
   uint32_t outer_hash_b{};        ///< The `b` parameter for the outer hash
   uint16_t num_bins{};            ///< Number of bins

   // NOTE(review): the three columns below carry no description in the visible
   // documentation. Going by their names they look like the hash table itself
   // plus per-bin hashing coefficients and per-bin offsets -- confirm against
   // the implementation before relying on that reading.
   std::unique_ptr<cudf::column> table;
   std::unique_ptr<cudf::column> bin_coefficients;
   std::unique_ptr<cudf::column> bin_offsets;

   /// uint32 column: the code point metadata table to use for normalization
   std::unique_ptr<cudf::column> cp_metadata;
   /// uint64 column: the auxiliary code point table to use for normalization
   std::unique_ptr<cudf::column> aux_cp_table;
 };


 /**
  * @brief Load the hashed vocabulary file into device memory.
  *
  * @param filename_hashed_vocabulary Name of the hashed vocabulary file to load.
  *        NOTE(review): the expected file format is not visible from this
  *        header -- confirm against the implementation.
  * @param mr Device memory resource; defaults to the resource returned by
  *        `rmm::mr::get_current_device_resource()`. Presumably used to allocate
  *        the device memory owned by the returned object -- confirm.
  * @return Owning pointer to the loaded `hashed_vocabulary`.
  */
 std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::string const& filename_hashed_vocabulary,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());


 /**
  * @brief Result object for the subword_tokenize functions.
  *
  * Holds the dimensions of the tokenized output together with the owning
  * handles of the three output columns. Counts are value-initialized to zero
  * and the column handles start out null.
  */
 struct tokenizer_result {
   uint32_t nrows_tensor{};     ///< The number of rows for the output token-ids
   uint32_t sequence_length{};  ///< The number of token-ids in each row

   /// A vector of token-ids for each row
   std::unique_ptr<cudf::column> tensor_token_ids;
   /// This mask identifies which tensor-token-ids are valid
   std::unique_ptr<cudf::column> tensor_attention_mask;
   /// The metadata for each tensor row
   std::unique_ptr<cudf::column> tensor_metadata;
 };


 /**
  * @brief Creates a tokenizer that cleans the text, splits it into tokens and
  * returns token-ids.
  *
  * NOTE(review): the parameter descriptions below are inferred from the
  * parameter names only; this header does not show their exact semantics.
  * Confirm each against the implementation.
  *
  * @param strings The input strings column to tokenize.
  * @param vocabulary_table The vocabulary data to tokenize with (see
  *        `load_vocabulary_file`).
  * @param max_sequence_length Presumably the upper bound on token-ids per
  *        output row -- confirm.
  * @param stride Presumably the step between consecutive output rows produced
  *        from one input string -- confirm.
  * @param do_lower_case Presumably requests lower-casing of the text before
  *        tokenizing -- confirm.
  * @param do_truncate Presumably requests truncation of each string's output
  *        to a single row -- confirm.
  * @param mr Device memory resource; defaults to the resource returned by
  *        `rmm::mr::get_current_device_resource()`. Presumably used to allocate
  *        the returned columns -- confirm.
  * @return A `tokenizer_result` holding the token-ids, the attention mask and
  *         the per-row metadata columns along with their dimensions.
  */
 tokenizer_result subword_tokenize(
   cudf::strings_column_view const& strings,
   hashed_vocabulary const& vocabulary_table,
   uint32_t max_sequence_length,
   uint32_t stride,
   bool do_lower_case,
   bool do_truncate,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

   // end of group

 }  // namespace nvtext

cudf::strings_column_view
Given a column-view of strings type, an instance of this class provides a wrapper on this compound column for strings operations.
Definition: strings_column_view.hpp:36

rmm::mr::device_memory_resource

column.hpp
Class definition for cudf::column.

column_view.hpp
column view class definitions

get_current_device_resource
device_memory_resource * get_current_device_resource()

nvtext::subword_tokenize
tokenizer_result subword_tokenize(cudf::strings_column_view const &strings, hashed_vocabulary const &vocabulary_table, uint32_t max_sequence_length, uint32_t stride, bool do_lower_case, bool do_truncate, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Creates a tokenizer that cleans the text, splits it into tokens and returns token-ids from an input vocabulary.

nvtext::load_vocabulary_file
std::unique_ptr< hashed_vocabulary > load_vocabulary_file(std::string const &filename_hashed_vocabulary, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Load the hashed vocabulary file into device memory.

nvtext
NVText APIs.
Definition: byte_pair_encoding.hpp:24

strings_column_view.hpp
Class definition for cudf::strings_column_view.

nvtext::hashed_vocabulary
The vocabulary data for use with the subword_tokenize function.
Definition: subword_tokenize.hpp:33

nvtext::hashed_vocabulary::outer_hash_a
uint32_t outer_hash_a
The a parameter for the outer hash.
Definition: subword_tokenize.hpp:37

nvtext::hashed_vocabulary::bin_coefficients
std::unique_ptr< cudf::column > bin_coefficients
Definition: subword_tokenize.hpp:42

nvtext::hashed_vocabulary::outer_hash_b
uint32_t outer_hash_b
The b parameter for the outer hash.
Definition: subword_tokenize.hpp:38

nvtext::hashed_vocabulary::aux_cp_table
std::unique_ptr< cudf::column > aux_cp_table
uint64 column: the auxiliary code point table to use for normalization.
Definition: subword_tokenize.hpp:49

nvtext::hashed_vocabulary::bin_offsets
std::unique_ptr< cudf::column > bin_offsets
Definition: subword_tokenize.hpp:44

nvtext::hashed_vocabulary::separator_token_id
uint16_t separator_token_id
The separator token id in the vocabulary.
Definition: subword_tokenize.hpp:35

nvtext::hashed_vocabulary::table
std::unique_ptr< cudf::column > table
Definition: subword_tokenize.hpp:40

nvtext::hashed_vocabulary::cp_metadata
std::unique_ptr< cudf::column > cp_metadata
uint32 column: the code point metadata table to use for normalization.
Definition: subword_tokenize.hpp:47

nvtext::hashed_vocabulary::first_token_id
uint16_t first_token_id
The first token id in the vocabulary.
Definition: subword_tokenize.hpp:34

nvtext::hashed_vocabulary::num_bins
uint16_t num_bins
Number of bins.
Definition: subword_tokenize.hpp:39

nvtext::hashed_vocabulary::unknown_token_id
uint16_t unknown_token_id
The unknown token id in the vocabulary.
Definition: subword_tokenize.hpp:36

nvtext::tokenizer_result
Result object for the subword_tokenize functions.
Definition: subword_tokenize.hpp:73

nvtext::tokenizer_result::sequence_length
uint32_t sequence_length
The number of token-ids in each row.
Definition: subword_tokenize.hpp:81

nvtext::tokenizer_result::nrows_tensor
uint32_t nrows_tensor
The number of rows for the output token-ids.
Definition: subword_tokenize.hpp:77

nvtext::tokenizer_result::tensor_token_ids
std::unique_ptr< cudf::column > tensor_token_ids
A vector of token-ids for each row.
Definition: subword_tokenize.hpp:88

nvtext::tokenizer_result::tensor_metadata
std::unique_ptr< cudf::column > tensor_metadata
The metadata for each tensor row.
Definition: subword_tokenize.hpp:101

nvtext::tokenizer_result::tensor_attention_mask
std::unique_ptr< cudf::column > tensor_attention_mask
This mask identifies which tensor-token-ids are valid.
Definition: subword_tokenize.hpp:94