datactl/monikers.cpp
datactl/monikers.cpp
Namespaces
| Name |
|---|
| Syntalos |
Source code
/*
* Copyright (C) 2025-2026 Matthias Klumpp <matthias@tenstral.net>
*
* Licensed under the GNU Lesser General Public License Version 3
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the license, or
* (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this software. If not, see <http://www.gnu.org/licenses/>.
*/
#include "monikers.h"
#include <algorithm>
#include <chrono>
#include <mutex>
#include <random>
#include <string>
#include <string_view>
#include <vector>
#include <xxhash.h>
namespace Syntalos
{
namespace
{
// Word lists are embedded verbatim from data/words/*.txt as raw byte blobs.
// We avoid building a compile-time table of string_views (which would bake
// ~16 bytes per entry into the binary) and instead build a compact uint32_t
// offset table lazily on first use.
// Preferred path: when the preprocessor advertises #embed support through
// __has_embed, each array below is filled with the raw bytes of the named
// word file. The WordList class further down splits those bytes on '\n'.
#ifdef __has_embed
// Raw bytes of the adjective word file.
constexpr unsigned char adjectives_data[] = {
#embed "words/adjectives.txt"
};
// Raw bytes of the animal word file.
constexpr unsigned char animals_data[] = {
#embed "words/animals.txt"
};
// Raw bytes of the intermediate word file.
constexpr unsigned char intermediate_data[] = {
#embed "words/intermediate.txt"
};
// Raw bytes of the noun word file.
constexpr unsigned char nouns_data[] = {
#embed "words/nouns.txt"
};
#else
// Fallback for toolchains without #embed. The header has to provide the same
// four *_data arrays, since the accessors below take sizeof() of them.
// NOTE(review): presumably generated by the build system - verify there.
#include "words_embed.h"
#endif
/**
 * Lazily indexed, read-only view over a newline-separated word list.
 *
 * The text is not copied: the object only keeps a string_view onto the buffer
 * passed to the constructor, so that buffer has to outlive the WordList.
 * The table of line-start offsets is built once, on the first call to size()
 * or at(), guarded by std::call_once.
 *
 * Lines may end in LF or CRLF; the terminator is never part of an entry. A
 * newline at the very end of the blob does not create an extra empty entry.
 * Offsets are stored as uint32_t, so the blob must be smaller than 4 GiB.
 */
class WordList
{
public:
    /**
     * @param data First byte of the text blob (not copied).
     * @param size Number of bytes in the blob.
     */
    WordList(const unsigned char *data, std::size_t size) noexcept
        : m_blob(reinterpret_cast<const char *>(data), size)
    {
    }

    /// Number of entries (lines) in the list; builds the offset table if needed.
    std::size_t size() const
    {
        ensureBuilt();
        return m_starts.size();
    }

    /**
     * Returns the entry at @p idx without its line terminator.
     *
     * @param idx Zero-based entry index; must be smaller than size().
     *            No bounds check is performed.
     */
    std::string_view at(std::size_t idx) const
    {
        ensureBuilt();
        const std::size_t start = m_starts[idx];

        std::size_t end;
        if (idx + 1 < m_starts.size()) {
            // The next entry begins right after this line's '\n'.
            end = m_starts[idx + 1] - 1;
        } else {
            // The last entry runs to the end of the blob. The blob may or may
            // not end with a newline, so the separator is only stripped when
            // it is actually there; otherwise the last word would carry it.
            end = m_blob.size();
            if (end > start && m_blob[end - 1] == '\n')
                --end;
        }

        // Tolerate CRLF line endings.
        if (end > start && m_blob[end - 1] == '\r')
            --end;

        return m_blob.substr(start, end - start);
    }

private:
    /// Builds the line-start offset table exactly once.
    void ensureBuilt() const
    {
        std::call_once(m_built, [this] {
            // Count the lines first so the offset table is allocated once.
            auto lines = static_cast<std::size_t>(std::count(m_blob.begin(), m_blob.end(), '\n'));
            if (!m_blob.empty() && m_blob.back() != '\n')
                ++lines; // unterminated last line

            m_starts.reserve(lines);
            if (!m_blob.empty())
                m_starts.push_back(0);

            // A '\n' in the very last position ends the final entry rather
            // than starting a new one, hence the "i + 1 <" bound.
            for (std::size_t i = 0; i + 1 < m_blob.size(); ++i) {
                if (m_blob[i] == '\n')
                    m_starts.push_back(static_cast<uint32_t>(i + 1));
            }
        });
    }

    std::string_view m_blob;                // borrowed text, not owned
    mutable std::vector<uint32_t> m_starts; // offset of each entry's first byte
    mutable std::once_flag m_built;         // guards the lazy build of m_starts
};
// Process-wide adjective list, constructed on the first call from the
// embedded adjectives blob.
WordList &adjectives()
{
    static WordList instance{adjectives_data, sizeof adjectives_data};
    return instance;
}
// Process-wide animal list, constructed on the first call from the
// embedded animals blob.
WordList &animals()
{
    static WordList instance{animals_data, sizeof animals_data};
    return instance;
}
// Process-wide list of intermediate words, constructed on the first call
// from the embedded intermediate blob.
WordList &intermediates()
{
    static WordList instance{intermediate_data, sizeof intermediate_data};
    return instance;
}
// Process-wide noun list, constructed on the first call from the embedded
// nouns blob.
WordList &nouns()
{
    static WordList instance{nouns_data, sizeof nouns_data};
    return instance;
}
// Returns the calling thread's random engine. The engine is thread_local and
// is seeded on first use in each thread from std::random_device XORed with
// the current steady_clock tick count.
std::mt19937_64 &threadRng()
{
    static thread_local std::mt19937_64 engine{[] {
        const auto clockTicks = static_cast<uint64_t>(
            std::chrono::steady_clock::now().time_since_epoch().count());
        const auto deviceBits = std::random_device{}();
        return deviceBits ^ clockTicks;
    }()};
    return engine;
}
} // namespace
static std::string makeAnimalMonikerFromRng(std::mt19937_64 &rng)
{
auto &adj = adjectives();
auto &ani = animals();
std::uniform_int_distribution<std::size_t> aDist(0, adj.size() - 1);
std::uniform_int_distribution<std::size_t> nDist(0, ani.size() - 1);
const auto a = adj.at(aDist(rng));
const auto n = ani.at(nDist(rng));
std::string out;
out.reserve(a.size() + 1 + n.size());
out.append(a).append("-").append(n);
// Word entries may contain spaces (e.g. "guinea pig"); flatten them.
std::replace(out.begin(), out.end(), ' ', '-');
return out;
}
static std::string makeMonikerFromSeed(uint64_t seed)
{
std::mt19937_64 rng(seed);
auto &adj = adjectives();
auto &mid = intermediates();
auto &nn = nouns();
std::uniform_int_distribution<std::size_t> aDist(0, adj.size() - 1);
std::uniform_int_distribution<std::size_t> mDist(0, mid.size() - 1);
std::uniform_int_distribution<std::size_t> nDist(0, nn.size() - 1);
// ~25% of monikers get an intermediate word inserted
std::uniform_int_distribution<int> useMid(0, 3);
const auto a = adj.at(aDist(rng));
const bool withMid = useMid(rng) == 0;
const auto m = withMid ? mid.at(mDist(rng)) : std::string_view{};
const auto n = nn.at(nDist(rng));
std::string out;
out.reserve(a.size() + n.size() + (withMid ? m.size() + 2 : 1));
out.append(a).append("-");
if (withMid)
out.append(m).append("-");
out.append(n);
std::replace(out.begin(), out.end(), ' ', '-');
return out;
}
/// Returns a random "adjective-animal" moniker drawn from the calling
/// thread's random engine.
std::string makeAnimalMoniker()
{
    auto &rng = threadRng();
    return makeAnimalMonikerFromRng(rng);
}
/// Returns the "adjective-animal" moniker derived from @p source: the
/// string's XXH3 64-bit hash seeds the engine used to pick the words.
std::string makeAnimalMonikerForString(const std::string &source)
{
    const auto seed = XXH3_64bits(source.data(), source.size());
    std::mt19937_64 seededRng(seed);
    return makeAnimalMonikerFromRng(seededRng);
}
/// Returns the moniker derived from @p uuid, using the XXH3 64-bit hash of
/// the UUID's bytes as the seed.
std::string makeMonikerForUuid(const Uuid &uuid)
{
    const auto &raw = uuid.bytes;
    const auto seed = XXH3_64bits(raw.data(), raw.size());
    return makeMonikerFromSeed(seed);
}
/// Returns the moniker derived from @p source, using the XXH3 64-bit hash of
/// the string's bytes as the seed.
std::string makeMonikerForString(const std::string &source)
{
    const auto seed = XXH3_64bits(source.data(), source.size());
    return makeMonikerFromSeed(seed);
}
} // namespace Syntalos
Updated on 2026-05-09 at 19:46:31 +0000