Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packaging/nnfw.spec
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ GGMA package for on-device generative AI framework
%defattr(-,root,root,-)
%ifarch arm armv7l armv7hl aarch64 x86_64 %ix86 riscv64
%{_libdir}/ggma/libggma_api.so
%{_libdir}/ggma/libggma_tokenize.so
%{_includedir}/ggma/*
%{_libdir}/pkgconfig/ggma.pc
%endif
Expand Down
13 changes: 12 additions & 1 deletion runtime/ggma/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@ if (NOT BUILD_GGMA_API)
endif()

file(GLOB_RECURSE API_SRC "src/*.cc")
list(FILTER API_SRC EXCLUDE REGEX "src/tokenize/.*")

set(GGMA_DEV ggma-dev)
add_library(${GGMA_DEV} SHARED ${API_SRC})

# Public headers to publish
set(GGMA_API_HEADERS include/ggma_api.h include/ggma_types.h
include/ggma_context.h include/ggma_generate.h)
include/ggma_context.h include/ggma_generate.h
include/ggma_tokenize.h )

# GGMA install directory (same as ONERT_INSTALL_APIDIR)
set(GGMA_INSTALL_LIBDIR ${CMAKE_INSTALL_LIBDIR}/ggma)
Expand All @@ -18,6 +20,11 @@ set(GGMA_INSTALL_APIDIR ${CMAKE_INSTALL_LIBDIR}/ggma)
target_link_libraries(${GGMA_DEV} PRIVATE jsoncpp ${LIB_PTHREAD})
target_link_libraries(${GGMA_DEV} PRIVATE nnfw-dev)

# Add tokenize subdirectory
add_subdirectory(src/tokenize)

# Link tokenize library
target_link_libraries(${GGMA_DEV} PUBLIC ggma_tokenize)
# NOTE Below line is added to remove warning for android build
# It will be removed after android build uses gold linker
if (ANDROID)
Expand All @@ -38,6 +45,10 @@ install(TARGETS ${GGMA_DEV}
LIBRARY DESTINATION ${GGMA_INSTALL_APIDIR}
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ggma)

# Install ggma_tokenize library
install(TARGETS ggma_tokenize
LIBRARY DESTINATION ${GGMA_INSTALL_APIDIR})

# Install pkg-config file for GGMA API
configure_file(ggma.pc.in ggma.pc @ONLY)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggma.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
1 change: 1 addition & 0 deletions runtime/ggma/include/ggma_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
/* Include all GGMA API headers */
#include "ggma_context.h"
#include "ggma_generate.h"
#include "ggma_tokenize.h"

#ifdef __cplusplus
extern "C" {
Expand Down
104 changes: 104 additions & 0 deletions runtime/ggma/include/ggma_tokenize.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
 * @file ggma_tokenize.h
* @brief This file defines the GGMA Tokenizer interface.
*/
#ifndef __GGMA_GGMA_TOKENIZE_H__
#define __GGMA_GGMA_TOKENIZE_H__

#include "ggma_types.h"

#ifdef __cplusplus
extern "C" {
#endif

/**
 * @brief Opaque handle to a GGMA tokenizer.
 *
 * A GGMA tokenizer encapsulates all necessary components for text tokenization,
 * including the tokenizer model and vocabulary.
 *
 * The struct is only forward-declared in this header; handles are obtained from
 * ggma_create_tokenizer() and released with ggma_free_tokenizer().
 */
typedef struct ggma_tokenizer ggma_tokenizer;

/**
 * @brief Creates a GGMA tokenizer from a specified tokenizer path.
 *
 * This function loads the necessary tokenizer components from the given tokenizer path
 * and initializes a GGMA tokenizer handle.
 *
 * @param[out] tokenizer      Receives the handle of the tokenizer created from the given path.
 *                            Release it with ggma_free_tokenizer() when no longer needed.
 * @param[in]  tokenizer_path The path to the directory containing the tokenizer model and
 *                            vocabulary
 * @return     @c GGMA_STATUS_NO_ERROR on success,
 *             @c GGMA_STATUS_UNEXPECTED_NULL if @p tokenizer or @p tokenizer_path is NULL,
 *             or another error code if the tokenizer cannot be created.
 */
GGMA_STATUS ggma_create_tokenizer(ggma_tokenizer **tokenizer, const char *tokenizer_path);

/**
 * @brief Frees all resources associated with a GGMA tokenizer.
 *
 * @param[in] tokenizer The GGMA tokenizer to free. This handle will be invalid after the call.
 * @return    @c GGMA_STATUS_NO_ERROR if successful,
 *            @c GGMA_STATUS_UNEXPECTED_NULL if @p tokenizer is NULL,
 *            or another error code on failure.
 */
GGMA_STATUS ggma_free_tokenizer(ggma_tokenizer *tokenizer);

/**
 * @brief Tokenizes an input text string into a sequence of token IDs.
 *
 * This function uses the vocabulary from the created tokenizer to convert
 * the input text into a series of numerical token IDs.
 *
 * @param[in]  tokenizer    The GGMA tokenizer handle for tokenization.
 * @param[in]  text         The null-terminated text string to be tokenized.
 * @param[in]  text_len     The length of the text in bytes. If the text is null-terminated,
 *                          this can be 0 and the length will be determined internally.
 * @param[out] tokens       Output buffer for generated token IDs.
 * @param[in]  n_tokens_max Maximum number of tokens the @p tokens buffer can hold.
 * @param[out] n_tokens     A pointer to a variable that will receive the actual number of
 *                          tokens written to the @p tokens buffer.
 * @return     @c GGMA_STATUS_NO_ERROR if successful,
 *             @c GGMA_STATUS_UNEXPECTED_NULL if any of @p tokenizer, @p text, @p tokens or
 *             @p n_tokens is NULL, or another error code on failure
 *             (e.g., if the output buffer is too small).
 */
GGMA_STATUS ggma_tokenize(const ggma_tokenizer *tokenizer, const char *text, size_t text_len,
                          int32_t *tokens, size_t n_tokens_max, size_t *n_tokens);

/**
 * @brief Detokenizes a sequence of token IDs back into a text string.
 *
 * This function uses the vocabulary from the created tokenizer to convert
 * the sequence of token IDs back into a human-readable text string.
 *
 * @param[in]  tokenizer The GGMA tokenizer handle for detokenization.
 * @param[in]  tokens    A pointer to the input buffer containing the token IDs to be detokenized.
 * @param[in]  n_tokens  The number of tokens in the @p tokens buffer.
 * @param[out] text      A pointer to the output buffer where the detokenized text will be stored.
 * @param[in]  text_len  The maximum size of the @p text buffer in bytes.
 * @return     @c GGMA_STATUS_NO_ERROR if successful,
 *             @c GGMA_STATUS_UNEXPECTED_NULL if any of @p tokenizer, @p tokens or @p text
 *             is NULL, or another error code on failure
 *             (e.g., if the output buffer is too small).
 */
GGMA_STATUS ggma_detokenize(const ggma_tokenizer *tokenizer, const int32_t *tokens, size_t n_tokens,
                            char *text, size_t text_len);

#ifdef __cplusplus
}
#endif

#endif // __GGMA_GGMA_TOKENIZE_H__
97 changes: 97 additions & 0 deletions runtime/ggma/src/ggma_tokenize.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
* Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "ggma_tokenize.h"
#include "ggma_types.h"
#include "tokenize/TokenizerFactory.h"

#include <string>

extern "C" {

/**
 * @brief C entry point that builds a tokenizer and returns it as an opaque handle.
 *
 * The tokenizer is obtained from ggma::TokenizerFactory using the fixed backend id
 * "sentencepiece" and the caller-supplied path, then exposed to C callers as a
 * @c ggma_tokenizer pointer.
 *
 * @param[out] tokenizer      Receives the new handle; set to NULL when the factory
 *                            yields no tokenizer.
 * @param[in]  tokenizer_path Path forwarded unchanged to the tokenizer factory.
 * @return     @c GGMA_STATUS_UNEXPECTED_NULL if either argument is NULL,
 *             @c GGMA_STATUS_ERROR if the factory returns no tokenizer or any exception
 *             is thrown, @c GGMA_STATUS_NO_ERROR otherwise.
 */
GGMA_STATUS ggma_create_tokenizer(ggma_tokenizer **tokenizer, const char *tokenizer_path)
{
  if (!tokenizer || !tokenizer_path)
    return GGMA_STATUS_UNEXPECTED_NULL;

  try
  {
    std::string tokenizer_id = "sentencepiece";
    auto impl = ggma::TokenizerFactory::create(tokenizer_id, tokenizer_path);

    *tokenizer = reinterpret_cast<ggma_tokenizer *>(impl);

    // The factory signals "unknown id" or "creator failed" by returning nullptr rather
    // than throwing. Report that as an error instead of handing out a NULL handle
    // together with a success status.
    if (!impl)
      return GGMA_STATUS_ERROR;

    return GGMA_STATUS_NO_ERROR;
  }
  catch (...)
  {
    // Exceptions must not cross the extern "C" boundary.
    return GGMA_STATUS_ERROR;
  }
}

/**
 * @brief C entry point that destroys a tokenizer previously handed out as an opaque handle.
 *
 * The handle is converted back to a ggma::Tokenizer pointer and deleted.
 *
 * @param[in] tokenizer Handle to destroy.
 * @return    @c GGMA_STATUS_UNEXPECTED_NULL if @p tokenizer is NULL,
 *            @c GGMA_STATUS_ERROR if destruction throws,
 *            @c GGMA_STATUS_NO_ERROR otherwise.
 */
GGMA_STATUS ggma_free_tokenizer(ggma_tokenizer *tokenizer)
{
  if (tokenizer == nullptr)
    return GGMA_STATUS_UNEXPECTED_NULL;

  try
  {
    delete reinterpret_cast<ggma::Tokenizer *>(tokenizer);
  }
  catch (...)
  {
    // Exceptions must not cross the extern "C" boundary.
    return GGMA_STATUS_ERROR;
  }

  return GGMA_STATUS_NO_ERROR;
}

/**
 * @brief C entry point that forwards a tokenization request to the underlying tokenizer.
 *
 * All arguments are passed through unchanged to ggma::Tokenizer::tokenize().
 *
 * @param[in]  tokenizer    Opaque tokenizer handle.
 * @param[in]  text         Input text.
 * @param[in]  text_len     Length argument forwarded with @p text.
 * @param[out] tokens       Buffer receiving token IDs.
 * @param[in]  n_tokens_max Capacity of @p tokens, in tokens.
 * @param[out] n_tokens     Forwarded to the tokenizer as its token-count output.
 * @return     @c GGMA_STATUS_UNEXPECTED_NULL if any pointer argument is NULL,
 *             @c GGMA_STATUS_ERROR if the tokenizer throws,
 *             @c GGMA_STATUS_NO_ERROR otherwise.
 */
GGMA_STATUS ggma_tokenize(const ggma_tokenizer *tokenizer, const char *text, size_t text_len,
                          int32_t *tokens, size_t n_tokens_max, size_t *n_tokens)
{
  const bool has_null_argument =
    tokenizer == nullptr || text == nullptr || tokens == nullptr || n_tokens == nullptr;
  if (has_null_argument)
    return GGMA_STATUS_UNEXPECTED_NULL;

  GGMA_STATUS status = GGMA_STATUS_NO_ERROR;
  try
  {
    const auto &backend = *reinterpret_cast<const ggma::Tokenizer *>(tokenizer);
    // NOTE(review): the size_t returned by tokenize() is discarded here; its meaning is
    // not visible from this file - confirm that no failure is signalled through it.
    backend.tokenize(text, text_len, tokens, n_tokens_max, n_tokens);
  }
  catch (...)
  {
    // Exceptions must not cross the extern "C" boundary.
    status = GGMA_STATUS_ERROR;
  }
  return status;
}

/**
 * @brief C entry point that forwards a detokenization request to the underlying tokenizer.
 *
 * All arguments are passed through unchanged to ggma::Tokenizer::detokenize().
 *
 * @param[in]  tokenizer Opaque tokenizer handle.
 * @param[in]  tokens    Token IDs to convert.
 * @param[in]  n_tokens  Number of entries in @p tokens.
 * @param[out] text      Buffer receiving the text.
 * @param[in]  text_len  Size of @p text in bytes.
 * @return     @c GGMA_STATUS_UNEXPECTED_NULL if any pointer argument is NULL,
 *             @c GGMA_STATUS_ERROR if the tokenizer throws,
 *             @c GGMA_STATUS_NO_ERROR otherwise.
 */
GGMA_STATUS ggma_detokenize(const ggma_tokenizer *tokenizer, const int32_t *tokens, size_t n_tokens,
                            char *text, size_t text_len)
{
  const bool has_null_argument = tokenizer == nullptr || tokens == nullptr || text == nullptr;
  if (has_null_argument)
    return GGMA_STATUS_UNEXPECTED_NULL;

  GGMA_STATUS status = GGMA_STATUS_NO_ERROR;
  try
  {
    const auto &backend = *reinterpret_cast<const ggma::Tokenizer *>(tokenizer);
    // NOTE(review): the size_t returned by detokenize() is discarded here; its meaning is
    // not visible from this file - confirm that no failure is signalled through it.
    backend.detokenize(tokens, n_tokens, text, text_len);
  }
  catch (...)
  {
    // Exceptions must not cross the extern "C" boundary.
    status = GGMA_STATUS_ERROR;
  }
  return status;
}

} // extern "C"
30 changes: 30 additions & 0 deletions runtime/ggma/src/tokenize/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Tokenize module CMakeLists.txt
#
# Builds the ggma_tokenize shared library. The factory sources are always compiled;
# the SentencePiece backend is added only when the SentencePiece package is found.

# Find SentencePiece if available
# QUIET: a missing package is not reported here; the build simply proceeds without
# TokenizerSentencePiece.cc.
nnfw_find_package(SentencePiece QUIET)

# Sources compiled regardless of which backends are available
set(TOKENIZE_SOURCES
Tokenizer.h
TokenizerFactory.h
TokenizerFactory.cc
)

# Optional SentencePiece backend
# NOTE(review): without this source, presumably nothing registers a "sentencepiece"
# creator with the factory - confirm how callers are expected to handle that case.
if(SentencePiece_FOUND)
list(APPEND TOKENIZE_SOURCES TokenizerSentencePiece.cc)
endif()

# Create tokenize as a shared library
add_library(ggma_tokenize SHARED ${TOKENIZE_SOURCES})

# PUBLIC: these directories are also propagated to targets that link ggma_tokenize.
# NOTE(review): relative to this directory, "../include" resolves to src/include, while
# the public GGMA headers appear to live two levels up (ggma/include) - confirm the path.
target_include_directories(ggma_tokenize PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../include)

if(SentencePiece_FOUND)
# SentencePiece is an implementation detail, so it is linked and included PRIVATE-ly.
target_link_libraries(ggma_tokenize PRIVATE ${SentencePiece_LIBRARIES})
target_include_directories(ggma_tokenize PRIVATE ${SentencePiece_INCLUDE_DIRS})
# Add explicit dependency to ensure SentencePiece is built and installed when ggma is built
add_dependencies(ggma_tokenize sentencepiece)

# Add log library for Android
# Note that this modifies the sentencepiece target itself, not ggma_tokenize.
if(ANDROID)
target_link_libraries(sentencepiece log)
endif()
endif()
43 changes: 43 additions & 0 deletions runtime/ggma/src/tokenize/Tokenizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef __GGMA_TOKENIZE_TOKENIZER_H__
#define __GGMA_TOKENIZE_TOKENIZER_H__

#include <cstddef>
#include <cstdint>
#include <string>

namespace ggma
{

/**
 * @brief Abstract interface implemented by every tokenizer backend.
 *
 * Instances cannot be copied, and the default constructor is protected so that only
 * derived classes can construct the base part.
 */
class Tokenizer
{
public:
  // Tokenizers are non-copyable.
  Tokenizer(const Tokenizer &) = delete;
  Tokenizer &operator=(const Tokenizer &) = delete;

  // Virtual so that a backend can be destroyed through a Tokenizer pointer.
  virtual ~Tokenizer() = default;

  /**
   * @brief Returns the identifier string of this tokenizer backend.
   */
  virtual std::string id() const = 0;

  /**
   * @brief Converts text into token IDs.
   *
   * @param[in]  text       Input text.
   * @param[in]  text_len   Length argument accompanying @p text.
   * @param[out] tokens     Buffer receiving token IDs.
   * @param[in]  max_tokens Capacity of @p tokens, in tokens.
   * @param[out] n_tokens   Token-count output.
   * @return     A size_t whose meaning is defined by the implementation
   *             (NOTE(review): not specified in this header - confirm with the backends).
   */
  virtual size_t tokenize(const char *text, size_t text_len, int32_t *tokens, size_t max_tokens,
                          size_t *n_tokens) const = 0;

  /**
   * @brief Converts token IDs back into text.
   *
   * @param[in]  tokens   Token IDs to convert.
   * @param[in]  n_tokens Number of entries in @p tokens.
   * @param[out] text     Buffer receiving the text.
   * @param[in]  text_len Size of @p text.
   * @return     A size_t whose meaning is defined by the implementation
   *             (NOTE(review): not specified in this header - confirm with the backends).
   */
  virtual size_t detokenize(const int32_t *tokens, size_t n_tokens, char *text,
                            size_t text_len) const = 0;

protected:
  Tokenizer() = default; // Protected constructor to enforce factory pattern
};

} // namespace ggma

#endif // __GGMA_TOKENIZE_TOKENIZER_H__
50 changes: 50 additions & 0 deletions runtime/ggma/src/tokenize/TokenizerFactory.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "TokenizerFactory.h"
#include "Tokenizer.h"

namespace ggma
{

/**
 * @brief Returns the single TokenizerFactory shared by create() and add().
 *
 * The factory is a function-local static, so it is constructed on first use.
 */
TokenizerFactory &TokenizerFactory::getInstance()
{
  static TokenizerFactory singleton;
  return singleton;
}

/**
 * @brief Builds a tokenizer using the creator registered under @p id.
 *
 * @param[in] id           Key under which the creator was registered via add().
 * @param[in] package_path Argument forwarded to the creator.
 * @return    The creator's result, or nullptr when no creator is registered for @p id
 *            (the creator itself may also return nullptr).
 */
Tokenizer *TokenizerFactory::create(const std::string &id, const std::string &package_path)
{
  auto &registry = getInstance()._ctors;

  auto entry = registry.find(id);
  if (entry == registry.end())
    return nullptr;

  // A null result from the creator is passed straight through to the caller.
  return entry->second(package_path);
}

/**
 * @brief Registers a tokenizer creator under @p name.
 *
 * A creator already stored under the same name is overwritten.
 *
 * @param[in] name Key later passed to create().
 * @param[in] ctor Creator stored for that key.
 */
void TokenizerFactory::add(const std::string &name, const Creator &ctor)
{
  getInstance()._ctors[name] = ctor;
}

} // namespace ggma
Loading