Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packaging/nnfw.spec
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ GGMA package for on-device generative AI framework
%defattr(-,root,root,-)
%ifarch arm armv7l armv7hl aarch64 x86_64 %ix86 riscv64
%{_libdir}/ggma/libggma_api.so
%{_libdir}/ggma/libggma_tokenize.so
%{_includedir}/ggma/*
%{_libdir}/pkgconfig/ggma.pc
%endif
Expand Down
13 changes: 12 additions & 1 deletion runtime/ggma/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@ if (NOT BUILD_GGMA_API)
endif()

file(GLOB_RECURSE API_SRC "src/*.cc")
list(FILTER API_SRC EXCLUDE REGEX "src/tokenize/.*")

set(GGMA_DEV ggma-dev)
add_library(${GGMA_DEV} SHARED ${API_SRC})

# Public headers to publish
set(GGMA_API_HEADERS include/ggma_api.h include/ggma_types.h
include/ggma_context.h include/ggma_generate.h)
include/ggma_context.h include/ggma_generate.h
include/ggma_tokenize.h )

# GGMA install directory (same as ONERT_INSTALL_APIDIR)
set(GGMA_INSTALL_LIBDIR ${CMAKE_INSTALL_LIBDIR}/ggma)
Expand All @@ -18,6 +20,11 @@ set(GGMA_INSTALL_APIDIR ${CMAKE_INSTALL_LIBDIR}/ggma)
target_link_libraries(${GGMA_DEV} PRIVATE jsoncpp ${LIB_PTHREAD})
target_link_libraries(${GGMA_DEV} PRIVATE nnfw-dev)

# Add tokenize subdirectory
add_subdirectory(src/tokenize)

# Link tokenize library
target_link_libraries(${GGMA_DEV} PUBLIC ggma_tokenize)
# NOTE Below line is added to remove warning for android build
# It will be removed after android build uses gold linker
if (ANDROID)
Expand All @@ -38,6 +45,10 @@ install(TARGETS ${GGMA_DEV}
LIBRARY DESTINATION ${GGMA_INSTALL_APIDIR}
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ggma)

# Install ggma_tokenize library
install(TARGETS ggma_tokenize
LIBRARY DESTINATION ${GGMA_INSTALL_APIDIR})

# Install pkg-config file for GGMA API
configure_file(ggma.pc.in ggma.pc @ONLY)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggma.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
1 change: 1 addition & 0 deletions runtime/ggma/include/ggma_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
/* Include all GGMA API headers */
#include "ggma_context.h"
#include "ggma_generate.h"
#include "ggma_tokenize.h"

#ifdef __cplusplus
extern "C" {
Expand Down
104 changes: 104 additions & 0 deletions runtime/ggma/include/ggma_tokenize.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* @file ggma_tokenize.h
* @brief This file defines the GGMA Tokenizer interface.
*/
#ifndef __GGMA_GGMA_TOKENIZE_H__
#define __GGMA_GGMA_TOKENIZE_H__

#include "ggma_types.h"

#ifdef __cplusplus
extern "C" {
#endif

/**
* @brief Opaque handle to a GGMA tokenizer.
*
* A GGMA tokenizer encapsulates all necessary components for text tokenization,
* including the tokenizer model and vocabulary.
*
* The struct is only forward-declared here; callers obtain a handle from
* ggma_create_tokenizer() and release it with ggma_free_tokenizer().
*/
typedef struct ggma_tokenizer ggma_tokenizer;

/**
* @brief Creates a GGMA tokenizer from a specified tokenizer path.
*
* This function loads the necessary tokenizer components from the given tokenizer path
* and initializes a GGMA tokenizer handle. Release the handle with ggma_free_tokenizer()
* once it is no longer needed.
*
* @param[out] tokenizer Receives the handle of the tokenizer created from the given path
* @param[in] tokenizer_path The path to the directory containing the tokenizer model and
* vocabulary
* @return @c GGMA_STATUS_NO_ERROR on success, @c GGMA_STATUS_UNEXPECTED_NULL if
* @p tokenizer or @p tokenizer_path is NULL, or another error code if the
* tokenizer cannot be created.
*/
GGMA_STATUS ggma_create_tokenizer(ggma_tokenizer **tokenizer, const char *tokenizer_path);

/**
* @brief Frees all resources associated with a GGMA tokenizer.
*
* @param[in] tokenizer The GGMA tokenizer to free. This handle will be invalid after the call.
* @return @c GGMA_STATUS_NO_ERROR if successful, @c GGMA_STATUS_UNEXPECTED_NULL if
* @p tokenizer is NULL, or another error code on failure.
*/
GGMA_STATUS ggma_free_tokenizer(ggma_tokenizer *tokenizer);

/**
* @brief Tokenizes an input text string into a sequence of token IDs.
*
* This function uses the vocabulary from the created tokenizer to convert
* the input text into a series of numerical token IDs.
*
* @param[in] tokenizer The GGMA tokenizer handle for tokenization.
* @param[in] text The text string to be tokenized.
* @param[in] text_len The length of the text in bytes. If the text is null-terminated,
* this can be 0 and the length will be determined internally.
* @param[out] tokens Output buffer for generated token IDs.
* @param[in] n_tokens_max Maximum number of tokens the @p tokens buffer can hold.
* @param[out] n_tokens A pointer to a variable that will receive the actual number of
* tokens written to the @p tokens buffer.
* @return @c GGMA_STATUS_NO_ERROR if successful, @c GGMA_STATUS_UNEXPECTED_NULL if
* @p tokenizer, @p text, @p tokens or @p n_tokens is NULL, or another error code
* on failure (e.g., if the output buffer is too small).
*/
GGMA_STATUS ggma_tokenize(const ggma_tokenizer *tokenizer, const char *text, size_t text_len,
int32_t *tokens, size_t n_tokens_max, size_t *n_tokens);

/**
* @brief Detokenizes a sequence of token IDs back into a text string.
*
* This function uses the vocabulary from the created tokenizer to convert
* the sequence of token IDs back into a human-readable text string.
*
* @param[in] tokenizer The GGMA tokenizer handle for detokenization.
* @param[in] tokens A pointer to the input buffer containing the token IDs to be detokenized.
* @param[in] n_tokens The number of tokens in the @p tokens buffer.
* @param[out] text A pointer to the output buffer where the detokenized text will be stored.
* @param[in] text_len The maximum size of the @p text buffer in bytes.
* @return @c GGMA_STATUS_NO_ERROR if successful, @c GGMA_STATUS_UNEXPECTED_NULL if
* @p tokenizer, @p tokens or @p text is NULL, or another error code on failure
* (e.g., if the output buffer is too small).
*/
GGMA_STATUS ggma_detokenize(const ggma_tokenizer *tokenizer, const int32_t *tokens, size_t n_tokens,
char *text, size_t text_len);

#ifdef __cplusplus
}
#endif

#endif // __GGMA_GGMA_TOKENIZE_H__
97 changes: 97 additions & 0 deletions runtime/ggma/src/ggma_tokenize.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
* Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "ggma_tokenize.h"
#include "ggma_types.h"
#include "tokenize/TokenizerFactory.h"

#include <string>

extern "C" {

/**
 * @brief Creates a tokenizer for @p tokenizer_path and returns it as an opaque handle.
 *
 * The backend is looked up in ggma::TokenizerFactory under the id "sentencepiece".
 * The factory returns nullptr when no creator is registered for that id or when the
 * creator itself yields nullptr; that case is reported as an error instead of
 * handing a NULL handle back together with a success status.
 *
 * @param[out] tokenizer      Receives the new handle; set to NULL on any failure
 * @param[in]  tokenizer_path Path forwarded to the tokenizer creator
 * @return GGMA_STATUS_NO_ERROR on success,
 *         GGMA_STATUS_UNEXPECTED_NULL if either argument is NULL,
 *         GGMA_STATUS_ERROR if the tokenizer could not be created or an exception was thrown
 */
GGMA_STATUS ggma_create_tokenizer(ggma_tokenizer **tokenizer, const char *tokenizer_path)
{
  if (!tokenizer || !tokenizer_path)
    return GGMA_STATUS_UNEXPECTED_NULL;

  // Give the out-parameter a defined value on every failure path below.
  *tokenizer = nullptr;

  try
  {
    const std::string tokenizer_id = "sentencepiece";
    auto impl = ggma::TokenizerFactory::create(tokenizer_id, tokenizer_path);

    // nullptr means "unknown id" or "creator failed" - never report that as success.
    if (!impl)
      return GGMA_STATUS_ERROR;

    *tokenizer = reinterpret_cast<ggma_tokenizer *>(impl);
    return GGMA_STATUS_NO_ERROR;
  }
  catch (...)
  {
    // Exceptions must not cross the C API boundary.
    return GGMA_STATUS_ERROR;
  }
}

/**
 * @brief Destroys the tokenizer behind an opaque handle.
 *
 * @param[in] tokenizer Handle to destroy; it is cast back to ggma::Tokenizer and deleted
 * @return GGMA_STATUS_NO_ERROR on success,
 *         GGMA_STATUS_UNEXPECTED_NULL if @p tokenizer is NULL,
 *         GGMA_STATUS_ERROR if an exception was thrown while deleting
 */
GGMA_STATUS ggma_free_tokenizer(ggma_tokenizer *tokenizer)
{
  if (tokenizer == nullptr)
  {
    return GGMA_STATUS_UNEXPECTED_NULL;
  }

  try
  {
    delete reinterpret_cast<ggma::Tokenizer *>(tokenizer);
  }
  catch (...)
  {
    // Exceptions must not cross the C API boundary.
    return GGMA_STATUS_ERROR;
  }

  return GGMA_STATUS_NO_ERROR;
}

/**
 * @brief C entry point that forwards a tokenize request to the ggma::Tokenizer behind the handle.
 *
 * All arguments are passed to Tokenizer::tokenize() unchanged; the value that call
 * returns is not inspected here.
 *
 * @return GGMA_STATUS_NO_ERROR when the call completes without throwing,
 *         GGMA_STATUS_UNEXPECTED_NULL if @p tokenizer, @p text, @p tokens or @p n_tokens is NULL,
 *         GGMA_STATUS_ERROR if the tokenizer throws
 */
GGMA_STATUS ggma_tokenize(const ggma_tokenizer *tokenizer, const char *text, size_t text_len,
                          int32_t *tokens, size_t n_tokens_max, size_t *n_tokens)
{
  const bool has_null_arg =
    (tokenizer == nullptr) || (text == nullptr) || (tokens == nullptr) || (n_tokens == nullptr);
  if (has_null_arg)
  {
    return GGMA_STATUS_UNEXPECTED_NULL;
  }

  try
  {
    const auto *backend = reinterpret_cast<const ggma::Tokenizer *>(tokenizer);
    backend->tokenize(text, text_len, tokens, n_tokens_max, n_tokens);
  }
  catch (...)
  {
    // Exceptions must not cross the C API boundary.
    return GGMA_STATUS_ERROR;
  }

  return GGMA_STATUS_NO_ERROR;
}

/**
 * @brief C entry point that forwards a detokenize request to the ggma::Tokenizer behind the handle.
 *
 * All arguments are passed to Tokenizer::detokenize() unchanged; the value that call
 * returns is not inspected here.
 *
 * @return GGMA_STATUS_NO_ERROR when the call completes without throwing,
 *         GGMA_STATUS_UNEXPECTED_NULL if @p tokenizer, @p tokens or @p text is NULL,
 *         GGMA_STATUS_ERROR if the tokenizer throws
 */
GGMA_STATUS ggma_detokenize(const ggma_tokenizer *tokenizer, const int32_t *tokens, size_t n_tokens,
                            char *text, size_t text_len)
{
  const bool has_null_arg = (tokenizer == nullptr) || (tokens == nullptr) || (text == nullptr);
  if (has_null_arg)
  {
    return GGMA_STATUS_UNEXPECTED_NULL;
  }

  try
  {
    const auto *backend = reinterpret_cast<const ggma::Tokenizer *>(tokenizer);
    backend->detokenize(tokens, n_tokens, text, text_len);
  }
  catch (...)
  {
    // Exceptions must not cross the C API boundary.
    return GGMA_STATUS_ERROR;
  }

  return GGMA_STATUS_NO_ERROR;
}

} // extern "C"
30 changes: 30 additions & 0 deletions runtime/ggma/src/tokenize/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Tokenize module CMakeLists.txt
# Builds the ggma_tokenize shared library. The SentencePiece-backed tokenizer is
# compiled and linked only when the SentencePiece package is found.

# Find SentencePiece if available
nnfw_find_package(SentencePiece QUIET)

# Sources that are always part of the library
set(TOKENIZE_SOURCES
Tokenizer.h
TokenizerFactory.h
TokenizerFactory.cc
)

# Without SentencePiece the library contains the factory only
if(SentencePiece_FOUND)
list(APPEND TOKENIZE_SOURCES TokenizerSentencePiece.cc)
endif()

# Create tokenize as a shared library
add_library(ggma_tokenize SHARED ${TOKENIZE_SOURCES})

# PUBLIC: targets linking ggma_tokenize also get these include directories.
# NOTE(review): "../include" resolves to runtime/ggma/src/include, while the public
# GGMA headers live in runtime/ggma/include - confirm whether "../../include" was intended.
target_include_directories(ggma_tokenize PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../include)

if(SentencePiece_FOUND)
target_link_libraries(ggma_tokenize PRIVATE ${SentencePiece_LIBRARIES})
target_include_directories(ggma_tokenize PRIVATE ${SentencePiece_INCLUDE_DIRS})
# Add explicit dependency to ensure SentencePiece is built and installed when ggma is built
add_dependencies(ggma_tokenize sentencepiece)

# Add log library for Android
# NOTE: this links "log" into the sentencepiece target itself, not into ggma_tokenize.
if(ANDROID)
target_link_libraries(sentencepiece log)
endif()
endif()
43 changes: 43 additions & 0 deletions runtime/ggma/src/tokenize/Tokenizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef __GGMA_TOKENIZE_TOKENIZER_H__
#define __GGMA_TOKENIZE_TOKENIZER_H__

// <cstddef>/<cstdint> are included explicitly so this header does not rely on
// std::size_t / std::int32_t leaking in through <string>.
#include <cstddef>
#include <cstdint>
#include <string>

namespace ggma
{

/**
 * @brief Abstract interface implemented by every tokenizer backend.
 *
 * Instances are produced through TokenizerFactory and destroyed through a
 * pointer to this base class, hence the virtual destructor. The type is
 * neither copy-constructible nor copy-assignable.
 */
class Tokenizer
{
public:
  virtual ~Tokenizer() = default;

  /**
   * @brief Identifier of the concrete tokenizer implementation.
   */
  virtual std::string id() const = 0;

  /**
   * @brief Converts text into token IDs.
   *
   * @param[in]  text       Input text
   * @param[in]  text_len   Length of @p text in bytes
   * @param[out] tokens     Buffer receiving the token IDs
   * @param[in]  max_tokens Capacity of @p tokens, in tokens
   * @param[out] n_tokens   Receives the number of tokens produced
   * @return NOTE(review): the meaning of the returned size_t is defined by the
   *         implementations, which are not visible from this header - confirm and document.
   */
  virtual std::size_t tokenize(const char *text, std::size_t text_len, std::int32_t *tokens,
                               std::size_t max_tokens, std::size_t *n_tokens) const = 0;

  /**
   * @brief Converts token IDs back into text.
   *
   * @param[in]  tokens   Token IDs to convert
   * @param[in]  n_tokens Number of entries in @p tokens
   * @param[out] text     Buffer receiving the text
   * @param[in]  text_len Capacity of @p text in bytes
   * @return NOTE(review): the meaning of the returned size_t is defined by the
   *         implementations, which are not visible from this header - confirm and document.
   */
  virtual std::size_t detokenize(const std::int32_t *tokens, std::size_t n_tokens, char *text,
                                 std::size_t text_len) const = 0;

protected:
  Tokenizer() = default; // Protected constructor to enforce factory pattern
  Tokenizer(const Tokenizer &) = delete;
  Tokenizer &operator=(const Tokenizer &) = delete;
};

} // namespace ggma

#endif // __GGMA_TOKENIZE_TOKENIZER_H__
50 changes: 50 additions & 0 deletions runtime/ggma/src/tokenize/TokenizerFactory.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "TokenizerFactory.h"
#include "Tokenizer.h"

namespace ggma
{

// Returns the single factory object, a function-local static constructed on first use.
TokenizerFactory &TokenizerFactory::getInstance()
{
  static TokenizerFactory singleton;
  return singleton;
}

// Looks up the creator registered under `id` and invokes it with `package_path`.
// Returns nullptr when no creator is registered for `id`; a nullptr produced by the
// creator itself is passed through unchanged.
Tokenizer *TokenizerFactory::create(const std::string &id, const std::string &package_path)
{
  auto &registry = getInstance()._ctors;

  auto entry = registry.find(id);
  if (entry == registry.end())
  {
    return nullptr;
  }

  return entry->second(package_path);
}

// Registers `ctor` under `name`; an existing entry with the same name is overwritten.
void TokenizerFactory::add(const std::string &name, const Creator &ctor)
{
  getInstance()._ctors[name] = ctor;
}

} // namespace ggma
Loading