Memory-Mapped Bit-Packed Array Library
Overview
A high-performance library for managing arbitrarily large arrays of uniform bit-width records using memory mapping and bit-level packing. Designed to support permutation ring operations on billion-element datasets that exceed available RAM.
1. Core Data Structure
1.1 BitPackedArray Interface
```cpp
template<typename IndexType = uint64_t>
class BitPackedArray {
private:
size_t element_bits_; // Bits per element (1-64)
IndexType count_; // Number of elements
size_t file_size_; // Total file size in bytes
void* mapped_memory_; // Memory-mapped region
int fd_; // File descriptor
public:
// Construction
BitPackedArray(const std::string& filename,
size_t element_bits,
IndexType count,
bool create_new = false);
// Element access
uint64_t get(IndexType index) const;
void set(IndexType index, uint64_t value);
// Swap operations - first class citizens
void swap(IndexType i, IndexType j);
void swap_ranges(IndexType start1, IndexType start2, IndexType count);
void swap_with_array(BitPackedArray& other, IndexType i, IndexType j);
// Optimized swap variants
void conditional_swap(IndexType i, IndexType j, bool condition);
void parallel_swap_ranges(IndexType start1, IndexType start2, IndexType count, size_t num_threads = 0);
// Bulk operations
void bulk_copy(IndexType dst_start, IndexType src_start, IndexType count);
void bulk_set(IndexType start, IndexType count, uint64_t value);
// Iterator support
class iterator;
iterator begin();
iterator end();
// Memory management
void sync(bool async = false);
void prefetch(IndexType start, IndexType count);
void advise_sequential();
void advise_random();
};
```
1.2 Bit Packing Mathematics and Swap Implementation
For element width w bits and index i:
```cpp
// Bit-level addressing
size_t bit_offset = i * w;
size_t byte_offset = bit_offset / 8;
size_t bit_in_byte = bit_offset % 8;
// Optimized swap implementation
void swap(IndexType i, IndexType j) {
if (i == j) return; // No-op for self-swap
// Handle aligned cases efficiently
if (element_bits_ % 8 == 0 && is_byte_aligned(i) && is_byte_aligned(j)) {
fast_byte_aligned_swap(i, j);
return;
}
// General bit-packed swap
uint64_t val_i = get_bits_unsafe(i);
uint64_t val_j = get_bits_unsafe(j);
set_bits_unsafe(i, val_j);
set_bits_unsafe(j, val_i);
mark_pages_dirty(i, j);
}
// Optimized byte-aligned swap
void fast_byte_aligned_swap(IndexType i, IndexType j) {
size_t bytes_per_element = element_bits_ / 8;
uint8_t* ptr_i = mapped_bytes_ + i * bytes_per_element;
uint8_t* ptr_j = mapped_bytes_ + j * bytes_per_element;
// Use SIMD for larger elements
if (bytes_per_element >= 16) {
simd_swap_bytes(ptr_i, ptr_j, bytes_per_element);
} else {
// Unrolled swap for small elements
switch (bytes_per_element) {
case 1: std::swap(*ptr_i, *ptr_j); break;
case 2: std::swap(*reinterpret_cast<uint16_t*>(ptr_i),
*reinterpret_cast<uint16_t*>(ptr_j)); break;
case 4: std::swap(*reinterpret_cast<uint32_t*>(ptr_i),
*reinterpret_cast<uint32_t*>(ptr_j)); break;
case 8: std::swap(*reinterpret_cast<uint64_t*>(ptr_i),
*reinterpret_cast<uint64_t*>(ptr_j)); break;
default:
for (size_t k = 0; k < bytes_per_element; k++) {
std::swap(ptr_i[k], ptr_j[k]);
}
}
}
}
// Range swap with memory locality optimization
void swap_ranges(IndexType start1, IndexType start2, IndexType count) {
if (count == 0 || start1 == start2) return;
// Block-wise swapping for cache efficiency
const size_t BLOCK_SIZE = 1024; // Tuned for L1 cache
for (IndexType i = 0; i < count; i += BLOCK_SIZE) {
IndexType block_end = std::min(i + BLOCK_SIZE, count);
// Prefetch both regions
prefetch_range(start1 + i, block_end - i);
prefetch_range(start2 + i, block_end - i);
// Swap block elements
for (IndexType j = i; j < block_end; j++) {
swap(start1 + j, start2 + j);
}
}
}
// Handle values that cross byte boundaries
uint64_t get_bits(size_t byte_off, size_t bit_off, size_t width) {
if (bit_off + width <= 8) {
// Single-byte case
uint8_t byte_val = mapped_bytes_[byte_off];
uint8_t mask = (1 << width) - 1;
return (byte_val >> (8 - bit_off - width)) & mask;
} else {
// Multi-byte case - read up to 8 bytes and extract.
// Note: an unaligned element wider than 56 bits can span 9 bytes and
// overflow this 64-bit accumulator; such widths need a two-part read.
uint64_t result = 0;
size_t bytes_needed = (bit_off + width + 7) / 8;
for (size_t j = 0; j < bytes_needed; j++) {
result = (result << 8) | mapped_bytes_[byte_off + j];
}
size_t total_bits = bytes_needed * 8;
size_t shift = total_bits - bit_off - width;
uint64_t mask = (width == 64) ? ~0ULL : ((1ULL << width) - 1); // avoid undefined shift when width == 64
return (result >> shift) & mask;
}
}
```
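As a concrete check of the addressing math above, the standalone snippet below (not part of the library interface) computes the offsets for a 29-bit element width, the width used for the suffix-array example later in this document.

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    const std::size_t w = 29;  // element width in bits
    const std::size_t i = 3;   // element index

    std::size_t bit_offset  = i * w;           // 87
    std::size_t byte_offset = bit_offset / 8;  // 10
    std::size_t bit_in_byte = bit_offset % 8;  // 7

    // Element 3 starts 7 bits into byte 10 and spans bytes 10..14, so the
    // multi-byte extraction path of get_bits() above would be taken.
    std::printf("bit_offset=%zu byte_offset=%zu bit_in_byte=%zu\n",
                bit_offset, byte_offset, bit_in_byte);
    return 0;
}
```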
2. Specialized Collections
2.1 SortableArray - External Sorting with First-Class Swaps
```cpp
template<typename IndexType = uint64_t>
class SortableArray : public BitPackedArray<IndexType> {
private:
size_t cache_size_; // RAM cache size
std::string temp_dir_; // Temporary file directory
mutable std::atomic<uint64_t> swap_count_{0}; // Performance tracking
public:
// Swap-based sorting algorithms
void quicksort_external(IndexType start, IndexType end);
void heapsort_external();
void introsort_external();
// External merge sort with swap optimization
void external_sort(std::function<bool(uint64_t, uint64_t)> comparator);
// Swap-optimized partitioning
IndexType partition_three_way(IndexType start, IndexType end, uint64_t pivot);
IndexType partition_hoare(IndexType start, IndexType end,
std::function<bool(uint64_t, uint64_t)> comp);
// K-way merge from multiple sorted arrays
static SortableArray merge_k_ways(
const std::vector<SortableArray*>& arrays,
const std::string& output_file
);
// Parallel sort using multiple threads/processes
void parallel_sort(size_t num_threads = 0);
// Swap-aware binary search
IndexType binary_search(uint64_t target) const;
std::pair<IndexType, IndexType> equal_range(uint64_t target) const;
// Performance metrics
uint64_t get_swap_count() const { return swap_count_.load(); }
void reset_swap_count() { swap_count_.store(0); }
private:
// Optimized swap with counting
void tracked_swap(IndexType i, IndexType j) {
if (i != j) {
this->swap(i, j);
swap_count_.fetch_add(1, std::memory_order_relaxed);
}
}
// Cache-aware quicksort implementation
void quicksort_cached(IndexType start, IndexType end, size_t depth_limit) {
if (end - start < 32) {
insertion_sort_range(start, end);
return;
}
if (depth_limit == 0) {
heapsort_range(start, end);
return;
}
// Three-way partitioning to handle duplicates
auto [lt, gt] = three_way_partition(start, end);
// Recurse on smaller partition first (stack optimization)
if (lt - start < end - gt) {
quicksort_cached(start, lt, depth_limit - 1);
quicksort_cached(gt, end, depth_limit - 1);
} else {
quicksort_cached(gt, end, depth_limit - 1);
quicksort_cached(start, lt, depth_limit - 1);
}
}
// Insertion sort for small ranges
void insertion_sort_range(IndexType start, IndexType end) {
for (IndexType i = start + 1; i < end; i++) {
uint64_t key = this->get(i);
IndexType j = i;
while (j > start && this->get(j - 1) > key) {
tracked_swap(j, j - 1);
j--;
}
}
}
// Three-way partitioning (Dijkstra's Dutch flag)
std::pair<IndexType, IndexType> three_way_partition(IndexType start, IndexType end) {
uint64_t pivot = this->get(start);
IndexType lt = start;
IndexType i = start + 1;
IndexType gt = end;
while (i < gt) {
uint64_t val = this->get(i);
if (val < pivot) {
tracked_swap(lt++, i++);
} else if (val > pivot) {
tracked_swap(i, --gt);
} else {
i++;
}
}
return {lt, gt};
}
};
```
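binary_search and equal_range are declared above without bodies. The following free-function sketch (the names lower_bound_index and Array are illustrative, not part of the library) shows a lower-bound search written only in terms of the get() and size() operations the interface guarantees, assuming the array has already been sorted in ascending order.

```cpp
#include <cstdint>

// Lower-bound search over any sorted array-like type exposing get(i) and size().
// Returns the first index whose value is >= target, or size() if none exists.
template <typename Array, typename IndexType = std::uint64_t>
IndexType lower_bound_index(const Array& a, std::uint64_t target) {
    IndexType lo = 0, hi = a.size();
    while (lo < hi) {
        IndexType mid = lo + (hi - lo) / 2;  // avoids overflow of (lo + hi)
        if (a.get(mid) < target) {
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }
    return lo;
}
```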
2.2 PermutationArray - Swap-Optimized Permutation Operations
```cpp
template<typename IndexType = uint64_t>
class PermutationArray : public BitPackedArray<IndexType> {
private:
mutable std::atomic<uint64_t> swap_operations_{0};
public:
// Apply permutation: result[i] = source[perm[i]]
void apply_permutation(const BitPackedArray<IndexType>& source,
BitPackedArray<IndexType>& result) const;
// In-place permutation application using swaps
void apply_permutation_inplace(BitPackedArray<IndexType>& target) const;
// Compose permutations: result[i] = perm2[perm1[i]]
void compose(const PermutationArray<IndexType>& other,
PermutationArray<IndexType>& result) const;
// Compute inverse permutation using swap-based algorithm
void invert(PermutationArray<IndexType>& result) const;
void invert_inplace();
// Check if valid permutation
bool is_valid_permutation() const;
// Cycle decomposition and manipulation
std::vector<std::vector<IndexType>> get_cycles() const;
void apply_cycle_decomposition_swaps(const std::vector<std::vector<IndexType>>& cycles);
void random_shuffle_with_swaps(size_t num_swaps = 0);
// Optimized permutation sorting
void sort_by_permutation_swaps(const std::function<bool(IndexType, IndexType)>& comp);
// Transposition operations
void apply_transposition(IndexType i, IndexType j); // Single swap
void apply_transpositions(const std::vector<std::pair<IndexType, IndexType>>& swaps);
// Bubble sort distance computation
IndexType bubble_sort_distance(const PermutationArray<IndexType>& other) const;
std::vector<std::pair<IndexType, IndexType>> minimal_swap_sequence(
const PermutationArray<IndexType>& target) const;
// Performance tracking
uint64_t get_swap_operations() const { return swap_operations_.load(); }
void reset_swap_counter() { swap_operations_.store(0); }
private:
void tracked_swap(IndexType i, IndexType j) {
if (i != j) {
this->swap(i, j);
swap_operations_.fetch_add(1, std::memory_order_relaxed);
}
}
// Cycle-following algorithm for in-place permutation
void apply_cycles_inplace(BitPackedArray<IndexType>& target,
const std::vector<std::vector<IndexType>>& cycles) const {
for (const auto& cycle : cycles) {
if (cycle.size() < 2) continue;
// A cycle of length k is applied with k - 1 transpositions
for (size_t i = 0; i < cycle.size() - 1; i++) {
target.swap(cycle[i], cycle[i + 1]);
}
}
}
};
```
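To make the cycle-to-transposition idea concrete, this standalone snippet applies a 3-cycle over indices {0, 2, 4} with two swaps on a plain std::vector, mirroring the loop in apply_cycles_inplace; a cycle of length k always needs k - 1 transpositions.

```cpp
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    std::vector<char> v = {'a', 'b', 'c', 'd', 'e'};
    // Apply the 3-cycle over indices {0, 2, 4} with 2 transpositions.
    std::vector<std::size_t> cycle = {0, 2, 4};
    for (std::size_t i = 0; i + 1 < cycle.size(); i++) {
        std::swap(v[cycle[i]], v[cycle[i + 1]]);
    }
    // v is now {c, b, e, d, a}: index 0 received the element from index 2,
    // index 2 from index 4, and index 4 from index 0.
    for (char ch : v) std::printf("%c ", ch);
    std::printf("\n");
    return 0;
}
```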
3. Memory Management Strategy
3.1 Adaptive Paging
```cpp
class AdaptivePager {
private:
struct PageInfo {
void* addr;
size_t size;
uint64_t last_access;
uint32_t access_count;
bool dirty;
};
std::unordered_map<size_t, PageInfo> active_pages_;
size_t max_resident_size_;
public:
// Intelligent page management
void* get_page(size_t offset, size_t size);
void mark_dirty(size_t offset, size_t size);
void flush_clean_pages();
void prefetch_sequential(size_t start, size_t length);
// Access pattern analysis
void analyze_access_pattern();
void optimize_for_sequential();
void optimize_for_random();
};
```
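get_page is only declared above. One plausible strategy, sketched here with illustrative names (map_window and the fd parameter are assumptions, not part of the interface), is to round the requested offset down to a page boundary and map just that window of the file; cache bookkeeping and eviction are omitted.

```cpp
#include <cstddef>
#include <sys/mman.h>
#include <unistd.h>

// Illustrative sketch: map only the window of the file covering [offset, offset + size).
void* map_window(int fd, std::size_t offset, std::size_t size) {
    const std::size_t page = static_cast<std::size_t>(sysconf(_SC_PAGESIZE));
    std::size_t aligned_off = offset - (offset % page);       // mmap offset must be page-aligned
    std::size_t length      = size + (offset - aligned_off);  // extend to cover the request
    void* base = mmap(nullptr, length, PROT_READ | PROT_WRITE, MAP_SHARED,
                      fd, static_cast<off_t>(aligned_off));
    if (base == MAP_FAILED) return nullptr;
    // Return a pointer to the requested offset inside the mapped window.
    return static_cast<char*>(base) + (offset - aligned_off);
}
```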
3.2 NUMA-Aware Allocation
```cpp
class NumaAllocator {
public:
// Bind memory to specific NUMA nodes
void* mmap_numa(size_t size, int node_id);
// Distribute large arrays across NUMA nodes
template<typename IndexType>
void distribute_array(BitPackedArray<IndexType>& array,
const std::vector<int>& node_ids);
// Migrate pages to optimal NUMA nodes
void migrate_to_local_node(void* addr, size_t size);
};
```
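mmap_numa is declared without a body. A minimal sketch using the Linux mbind(2) call (declared in <numaif.h> from libnuma, link with -lnuma) could look like the following; the anonymous mapping is an illustrative simplification, and a file-backed variant would map the array's descriptor instead.

```cpp
#include <cstddef>
#include <numaif.h>    // mbind, MPOL_BIND (libnuma; link with -lnuma)
#include <sys/mman.h>

// Illustrative sketch: allocate 'size' bytes and bind the pages to one NUMA node.
void* mmap_on_node(std::size_t size, int node_id) {
    void* addr = mmap(nullptr, size, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (addr == MAP_FAILED) return nullptr;

    unsigned long node_mask = 1UL << node_id;  // single-node mask; assumes node_id < 64
    if (mbind(addr, size, MPOL_BIND, &node_mask, sizeof(node_mask) * 8, 0) != 0) {
        munmap(addr, size);
        return nullptr;
    }
    return addr;
}
```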
4. External Sorting Algorithms
4.1 Multi-Way Merge Sort
```cpp
template<typename IndexType>
class ExternalMergeSorter {
private:
size_t memory_limit_;
size_t merge_factor_;
std::string temp_directory_;
public:
void sort(SortableArray<IndexType>& array) {
// Phase 1: Create sorted runs
std::vector<std::string> run_files = create_sorted_runs(array);
// Phase 2: Multi-way merge
while (run_files.size() > 1) {
run_files = merge_pass(run_files);
}
// Phase 3: Copy back to original array
copy_final_result(run_files[0], array);
}
private:
std::vector<std::string> create_sorted_runs(SortableArray<IndexType>& array) {
std::vector<std::string> runs;
size_t elements_per_run = memory_limit_ / array.element_size_bytes();
for (IndexType start = 0; start < array.size(); start += elements_per_run) {
IndexType end = std::min(start + elements_per_run, array.size());
// Load chunk into memory
std::vector<uint64_t> chunk;
for (IndexType i = start; i < end; i++) {
chunk.push_back(array.get(i));
}
// Sort in memory
std::sort(chunk.begin(), chunk.end());
// Write sorted run to disk
std::string run_file = create_temp_file();
write_run(run_file, chunk);
runs.push_back(run_file);
}
return runs;
}
std::vector<std::string> merge_pass(const std::vector<std::string>& inputs) {
std::vector<std::string> outputs;
for (size_t i = 0; i < inputs.size(); i += merge_factor_) {
size_t end = std::min(i + merge_factor_, inputs.size());
std::vector<std::string> batch(inputs.begin() + i, inputs.begin() + end);
std::string output_file = create_temp_file();
merge_k_files(batch, output_file);
outputs.push_back(output_file);
}
return outputs;
}
};
```
4.2 Parallel External Sort
```cpp
template<typename IndexType>
class ParallelExternalSorter {
private:
struct WorkerThread {
std::thread thread;
std::queue<SortTask> tasks;
std::mutex task_mutex;
std::condition_variable task_cv;
bool running;
};
std::vector<WorkerThread> workers_;
public:
void parallel_sort(SortableArray<IndexType>& array, size_t num_threads) {
// Partition array across threads
std::vector<ArrayPartition> partitions = partition_array(array, num_threads);
// Sort partitions in parallel
std::vector<std::future<std::string>> futures;
for (const auto& partition : partitions) {
futures.push_back(std::async(std::launch::async,
[this](const ArrayPartition& p) {
return sort_partition(p);
}, partition));
}
// Collect sorted runs
std::vector<std::string> sorted_runs;
for (auto& future : futures) {
sorted_runs.push_back(future.get());
}
// Final merge
std::string final_file = merge_k_files(sorted_runs, "final_sort.tmp");
copy_back_to_array(final_file, array);
}
};
```
5. Performance Optimization
5.1 Cache-Conscious Design
```cpp
class CacheOptimizedAccess {
private:
static constexpr size_t CACHE_LINE_SIZE = 64;
static constexpr size_t L1_CACHE_SIZE = 32 * 1024;
static constexpr size_t L2_CACHE_SIZE = 256 * 1024;
public:
// Prefetch multiple cache lines
void prefetch_range(const void* addr, size_t size) {
const char* ptr = static_cast<const char*>(addr);
for (size_t i = 0; i < size; i += CACHE_LINE_SIZE) {
__builtin_prefetch(ptr + i, 0, 3); // Temporal locality
}
}
// Block-wise operations for cache efficiency
template<typename Op>
void blocked_operation(BitPackedArray<>& array, Op operation, size_t block_size = L2_CACHE_SIZE) {
for (size_t i = 0; i < array.size(); i += block_size) {
size_t end = std::min(i + block_size, array.size());
operation(i, end);
}
}
};
```
5.2 SIMD-Accelerated Swap Operations
```cpp
#include <immintrin.h> // SSE/AVX2 intrinsics
class SIMDSwapOperations {
public:
// Vectorized byte swapping
static void simd_swap_bytes(uint8_t* ptr1, uint8_t* ptr2, size_t bytes) {
if (bytes >= 32 && is_avx2_available()) {
simd_swap_avx2(ptr1, ptr2, bytes);
} else if (bytes >= 16) {
simd_swap_sse(ptr1, ptr2, bytes);
} else {
// Fallback to scalar swapping
for (size_t i = 0; i < bytes; i++) {
std::swap(ptr1[i], ptr2[i]);
}
}
}
private:
static void simd_swap_avx2(uint8_t* ptr1, uint8_t* ptr2, size_t bytes) {
size_t simd_bytes = (bytes / 32) * 32;
for (size_t i = 0; i < simd_bytes; i += 32) {
__m256i vec1 = _mm256_loadu_si256((__m256i*)(ptr1 + i));
__m256i vec2 = _mm256_loadu_si256((__m256i*)(ptr2 + i));
_mm256_storeu_si256((__m256i*)(ptr1 + i), vec2);
_mm256_storeu_si256((__m256i*)(ptr2 + i), vec1);
}
// Handle remaining bytes
for (size_t i = simd_bytes; i < bytes; i++) {
std::swap(ptr1[i], ptr2[i]);
}
}
static void simd_swap_sse(uint8_t* ptr1, uint8_t* ptr2, size_t bytes) {
size_t simd_bytes = (bytes / 16) * 16;
for (size_t i = 0; i < simd_bytes; i += 16) {
__m128i vec1 = _mm_loadu_si128((__m128i*)(ptr1 + i));
__m128i vec2 = _mm_loadu_si128((__m128i*)(ptr2 + i));
_mm_storeu_si128((__m128i*)(ptr1 + i), vec2);
_mm_storeu_si128((__m128i*)(ptr2 + i), vec1);
}
// Handle remaining bytes
for (size_t i = simd_bytes; i < bytes; i++) {
std::swap(ptr1[i], ptr2[i]);
}
}
public:
// Vectorized comparison for conditional swaps
static void simd_conditional_swap_array(uint64_t* array, size_t size,
std::function<bool(uint64_t, uint64_t)> should_swap);
// Parallel bit extraction with swap capability
static void simd_extract_and_swap_bits(const uint8_t* packed_data,
uint64_t* output,
size_t element_bits,
size_t count,
const std::vector<std::pair<size_t, size_t>>& swap_pairs);
// Vectorized permutation application with swap tracking
static void simd_apply_permutation_with_swaps(const uint64_t* source,
uint64_t* dest,
const uint64_t* perm,
size_t count,
std::atomic<uint64_t>& swap_counter);
// Batch swap operations
static void simd_batch_swap(BitPackedArray<>& array,
const std::vector<std::pair<uint64_t, uint64_t>>& swap_pairs);
};
```
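is_avx2_available is referenced above but not defined. On GCC and Clang a runtime check can be built on __builtin_cpu_supports, as in this sketch; other compilers would query CPUID directly.

```cpp
// Runtime AVX2 detection for GCC/Clang; other compilers would use CPUID directly.
static bool is_avx2_available() {
#if defined(__GNUC__) || defined(__clang__)
    return __builtin_cpu_supports("avx2");
#else
    return false;  // conservative fallback
#endif
}
```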
6. Integration with Permutation Ring System
6.1 Usage Example with Swap-Optimized Operations
```cpp
// Create arrays for 500M suffix positions (29 bits each)
SortableArray<uint64_t> suffix_array("suffix_array.dat", 29, 500000000, true);
PermutationArray<uint64_t> forward_perm("forward.dat", 29, 500000000, true);
PermutationArray<uint64_t> backward_perm("backward.dat", 29, 500000000, true);
BitPackedArray<uint32_t> lcp_array("lcp.dat", 20, 500000000, true); // 20 bits for LCP
// Build suffix array using swap-optimized external sort
auto compare_suffixes = [&](uint64_t a, uint64_t b) {
return suffix_compare(text, a, b);
};
suffix_array.external_sort(compare_suffixes);
std::cout << "Suffix array sorted with " << suffix_array.get_swap_count()
<< " swaps\n";
// Build movement permutations using swap-based algorithms
build_movement_permutations_with_swaps(suffix_array, forward_perm, backward_perm);
// Compute LCP array efficiently with swap optimization
compute_lcp_external_swapped(suffix_array, lcp_array);
// Example: Apply a series of transpositions to reorder the permutation
std::vector<std::pair<uint64_t, uint64_t>> transpositions = {
{100000, 200000}, {50000, 150000}, {300000, 400000}
};
forward_perm.apply_transpositions(transpositions);
// Track swap efficiency
std::cout << "Permutation operations used " << forward_perm.get_swap_operations()
<< " swaps\n";
// Swap-optimized range operations
suffix_array.swap_ranges(0, 250000000, 1000); // Swap two 1000-element regions
// Conditional swaps based on permutation ring distance
for (size_t i = 0; i < suffix_array.size() - 1; i++) {
uint64_t val_i = suffix_array.get(i);
uint64_t val_j = suffix_array.get(i + 1);
// Only swap if it improves some criterion
bool should_swap = evaluate_swap_benefit(val_i, val_j);
suffix_array.conditional_swap(i, i + 1, should_swap);
}
```
6.2 Memory Usage Analysis
For 500M elements:
- Suffix positions (29 bits): 1.8 GB
- Forward/backward permutations (29 bits each): 3.6 GB
- LCP values (20 bits): 1.25 GB
- Total: ~6.7 GB on disk, manageable RAM usage through paging
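These figures follow from bytes = ceil(count * bits / 8); the short program below reproduces them.

```cpp
#include <cstdint>
#include <cstdio>

// Reproduces the on-disk sizes quoted above: bytes = ceil(count * bits / 8).
int main() {
    const std::uint64_t count = 500'000'000ULL;
    auto gigabytes = [&](unsigned bits) {
        std::uint64_t bytes = (count * bits + 7) / 8;
        return static_cast<double>(bytes) / 1e9;
    };
    std::printf("suffix positions (29 bits): %.2f GB\n", gigabytes(29));      // ~1.81 GB
    std::printf("two permutations (29 bits): %.2f GB\n", 2 * gigabytes(29));  // ~3.63 GB
    std::printf("LCP values (20 bits):       %.2f GB\n", gigabytes(20));      // ~1.25 GB
    std::printf("total:                      %.2f GB\n",
                3 * gigabytes(29) + gigabytes(20));                           // ~6.7 GB
    return 0;
}
```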
7. Platform Considerations
7.1 Linux-Specific Features
```cpp
#ifdef __linux__
// Use fallocate for fast file preallocation
fallocate(fd, 0, 0, file_size);
// Huge page support (MAP_HUGETLB requires hugetlbfs-backed files or anonymous mappings)
mmap(nullptr, size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_HUGETLB, fd, 0);
// Readahead for sequential access
readahead(fd, offset, length);
// NUMA binding
mbind(addr, size, MPOL_BIND, &node_mask, maxnode, 0);
#endif
```
7.2 Cross-Platform Abstraction
```cpp
class PlatformAbstraction {
public:
static void* create_mapping(int fd, size_t size, size_t offset);
static void destroy_mapping(void* addr, size_t size);
static void sync_mapping(void* addr, size_t size, bool async);
static void prefetch_mapping(void* addr, size_t size);
static void set_access_pattern(void* addr, size_t size, AccessPattern pattern);
};
```
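On POSIX systems, set_access_pattern (and the advise_sequential / advise_random methods from section 1.1) can be backed by madvise. A minimal sketch follows, with AccessPattern assumed to be a simple enum; Windows would instead rely on PrefetchVirtualMemory or FILE_FLAG_SEQUENTIAL_SCAN at file-open time.

```cpp
#include <cstddef>
#include <sys/mman.h>

enum class AccessPattern { Sequential, Random, Normal };

// POSIX-backed sketch of the access-pattern hint.
void set_access_pattern_posix(void* addr, std::size_t size, AccessPattern pattern) {
    int advice = MADV_NORMAL;
    switch (pattern) {
        case AccessPattern::Sequential: advice = MADV_SEQUENTIAL; break;
        case AccessPattern::Random:     advice = MADV_RANDOM;     break;
        case AccessPattern::Normal:     advice = MADV_NORMAL;     break;
    }
    madvise(addr, size, advice);  // advisory only; a failure here is not fatal
}
```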
This library enables your permutation ring system to scale to truly massive datasets while maintaining high performance through careful memory management and external algorithm design.
8. Bzip2 Integration: Index-While-Decompressing (Advanced Topic)
8.1 Exploiting Bzip2’s Internal BWT
Bzip2 applies the Burrows-Wheeler Transform to each compression block internally, which makes it possible to extract per-block permutation ring data directly during decompression without recomputing the expensive suffix sort.
```cpp
#include <bzlib.h> // bz_stream and the BZ2_* decompression API
class Bzip2IndexBuilder {
private:
struct Bzip2Block {
uint32_t block_size;
uint32_t original_ptr; // Points to original first character
uint8_t* bwt_data; // The BWT-transformed block
uint32_t* suffix_array; // Reconstructed from BWT
bool index_built;
};
std::vector<Bzip2Block> blocks_;
SortableArray<uint64_t> global_suffix_array_;
BitPackedArray<uint32_t> global_lcp_array_;
public:
// Build index while decompressing bzip2 file
std::unique_ptr<PermutationIndex> decompress_and_index(
const std::string& bzip2_file,
const std::string& index_prefix
);
// Extract suffix array from BWT without full decompression
void extract_suffix_arrays_only(const std::string& bzip2_file);
private:
// Hook into bzip2 decompression to capture BWT data
static int bzip2_read_hook(void* opaque, void* buffer, int size);
static void bzip2_block_hook(void* opaque,
uint8_t* bwt_block,
uint32_t block_size,
uint32_t original_ptr);
};
// Modified bzip2 decompression that exposes internal state
class IndexingBZ2Decoder {
private:
bz_stream stream_;
Bzip2IndexBuilder* index_builder_;
// BWT reconstruction state
uint32_t* tt_; // Transformation table
uint32_t* cftab_; // Character frequency cumulative table
uint8_t* ll8_; // Working space for BWT inversion
public:
IndexingBZ2Decoder(Bzip2IndexBuilder* builder)
: index_builder_(builder) {
memset(&stream_, 0, sizeof(stream_)); // bzalloc/bzfree/opaque must be null for default allocators
BZ2_bzDecompressInit(&stream_, 0, 0);
}
// Decompress while building suffix array from BWT
int decompress_block_with_indexing(
uint8_t* input, uint32_t input_size,
uint8_t* output, uint32_t* output_size
) {
// Standard bzip2 block header parsing
uint32_t block_size, original_ptr;
parse_bzip2_block_header(input, &block_size, &original_ptr);
// Extract BWT data directly
uint8_t* bwt_data = input + BZIP2_BLOCK_HEADER_SIZE;
// Build suffix array from BWT using existing transformation table
uint32_t* suffix_array = reconstruct_suffix_array_from_bwt(
bwt_data, block_size, original_ptr
);
// Add to global index
index_builder_->add_block_suffix_array(suffix_array, block_size,
get_global_offset());
// Continue with normal decompression
return BZ2_bzDecompress(&stream_);
}
private:
// Reconstruct suffix array from BWT without full string reconstruction
uint32_t* reconstruct_suffix_array_from_bwt(
uint8_t* bwt_data, uint32_t block_size, uint32_t original_ptr
) {
// Use bzip2's internal transformation table reconstruction
build_transformation_table(bwt_data, block_size);
// Extract suffix array positions directly from tt_ table
uint32_t* suffix_array = new uint32_t[block_size];
uint32_t pos = original_ptr;
for (uint32_t i = 0; i < block_size; i++) {
suffix_array[i] = pos;
pos = tt_[pos];
}
return suffix_array;
}
// Build the transformation table (standard bzip2 algorithm)
void build_transformation_table(uint8_t* bwt_data, uint32_t block_size) {
// Character frequency counting
memset(cftab_, 0, 257 * sizeof(uint32_t));
for (uint32_t i = 0; i < block_size; i++) {
cftab_[bwt_data[i]]++;
}
// Convert to cumulative frequencies
for (int32_t i = 1; i < 257; i++) {
cftab_[i] += cftab_[i-1];
}
// Build transformation table
for (int32_t i = block_size - 1; i >= 0; i--) {
uint8_t ch = bwt_data[i];
tt_[--cftab_[ch]] = i;
}
}
};
```
8.2 Block-Level Index Merging
```cpp
class BlockIndexMerger {
private:
struct BlockIndex {
uint32_t* suffix_array;
uint32_t block_size;
uint64_t global_offset; // Position in full file
uint32_t* lcp_array; // Computed for this block
bool is_sorted;
};
std::vector<BlockIndex> block_indices_;
public:
// Merge block-level suffix arrays into global index
void merge_block_indices(const std::string& output_prefix) {
// Sort blocks by global offset
std::sort(block_indices_.begin(), block_indices_.end(),
[](const BlockIndex& a, const BlockIndex& b) {
return a.global_offset < b.global_offset;
});
// Compute global suffix array size
uint64_t total_size = 0;
for (const auto& block : block_indices_) {
total_size += block.block_size;
}
// Create global arrays
SortableArray<uint64_t> global_sa(output_prefix + "_sa.dat",
compute_bits_needed(total_size),
total_size, true);
// Merge using k-way merge algorithm
merge_k_sorted_blocks(global_sa);
// Build movement permutations from merged suffix array
build_movement_permutations_from_merged(global_sa);
}
private:
// K-way merge of sorted block suffix arrays
void merge_k_sorted_blocks(SortableArray<uint64_t>& global_sa) {
// Priority queue for k-way merge
using MergeNode = std::tuple<uint64_t, size_t, size_t>; // value, block_idx, pos_in_block
std::priority_queue<MergeNode, std::vector<MergeNode>, std::greater<>> pq;
// Initialize with first element from each block
for (size_t i = 0; i < block_indices_.size(); i++) {
if (block_indices_[i].block_size > 0) {
uint64_t global_pos = block_indices_[i].global_offset +
block_indices_[i].suffix_array[0];
pq.emplace(global_pos, i, 0);
}
}
// Merge into global suffix array
uint64_t output_pos = 0;
while (!pq.empty()) {
auto [value, block_idx, pos_in_block] = pq.top();
pq.pop();
global_sa.set(output_pos++, value);
// Add next element from the same block
if (pos_in_block + 1 < block_indices_[block_idx].block_size) {
uint64_t next_pos = block_indices_[block_idx].global_offset +
block_indices_[block_idx].suffix_array[pos_in_block + 1];
pq.emplace(next_pos, block_idx, pos_in_block + 1);
}
}
}
};
```
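compute_bits_needed is used above but not defined; a minimal version returning the number of bits required to represent any index in [0, n) could be:

```cpp
#include <cstddef>
#include <cstdint>

// Smallest bit width able to represent every value in [0, n), with a floor of 1 bit.
static std::size_t compute_bits_needed(std::uint64_t n) {
    std::size_t bits = 1;
    while (bits < 64 && (1ULL << bits) < n) {
        bits++;
    }
    return bits;  // e.g. n = 500'000'000 -> 29 bits
}
```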
8.3 Streaming Index Construction
```cpp
class StreamingBzip2Indexer {
private:
std::unique_ptr<IndexingBZ2Decoder> decoder_;
std::unique_ptr<BlockIndexMerger> merger_;
std::string temp_dir_;
size_t processed_blocks_;
public:
// Stream processing of large bzip2 files
void index_large_bzip2_stream(std::istream& input,
const std::string& index_prefix) {
const size_t BUFFER_SIZE = 1024 * 1024; // 1MB buffer
std::vector<uint8_t> input_buffer(BUFFER_SIZE);
std::vector<uint8_t> output_buffer(BUFFER_SIZE * 4); // Decompression ratio
while (!input.eof()) {
input.read(reinterpret_cast<char*>(input_buffer.data()), BUFFER_SIZE);
uint32_t bytes_read = input.gcount();
if (bytes_read == 0) break;
uint32_t output_size = output_buffer.size();
int result = decoder_->decompress_block_with_indexing(
input_buffer.data(), bytes_read,
output_buffer.data(), &output_size
);
// Process any completed blocks
if (result == BZ_STREAM_END || processed_blocks_ % 100 == 0) {
// Periodic merging to avoid memory exhaustion
merger_->partial_merge_and_flush();
}
processed_blocks_++;
}
// Final merge
merger_->merge_block_indices(index_prefix);
}
// Memory-efficient processing for very large files
void index_with_external_merge(const std::string& bzip2_file,
const std::string& index_prefix,
size_t memory_limit = 2ULL * 1024 * 1024 * 1024) { // 2GB default
size_t blocks_per_batch = memory_limit / (sizeof(uint32_t) * AVERAGE_BLOCK_SIZE);
std::vector<std::string> temp_index_files;
size_t batch_count = 0;
// Process in batches
while (has_more_blocks()) {
std::string batch_file = temp_dir_ + "/batch_" + std::to_string(batch_count++) + ".idx";
process_batch(blocks_per_batch, batch_file);
temp_index_files.push_back(batch_file);
}
// External merge of batch indices
external_merge_index_files(temp_index_files, index_prefix);
}
};
```
8.4 Index Construction Optimizations
```cpp
class Bzip2IndexOptimizer {
public:
// Parallel block processing
static void parallel_block_indexing(
const std::vector<std::string>& bzip2_files,
const std::string& merged_index_prefix,
size_t num_threads = 0
) {
if (num_threads == 0) {
num_threads = std::thread::hardware_concurrency();
}
std::vector<std::future<std::string>> futures;
// Process each file in parallel
for (const auto& file : bzip2_files) {
futures.push_back(std::async(std::launch::async,
[](const std::string& bzip2_file) {
StreamingBzip2Indexer indexer;
std::string temp_index = create_temp_index_name();
std::ifstream input(bzip2_file, std::ios::binary);
indexer.index_large_bzip2_stream(input, temp_index);
return temp_index;
}, file));
}
// Collect results and merge
std::vector<std::string> temp_indices;
for (auto& future : futures) {
temp_indices.push_back(future.get());
}
// Final k-way merge of all indices
SortableArray<uint64_t>::merge_k_ways_from_files(
temp_indices, merged_index_prefix
);
}
// Cache-aware BWT inversion
static void optimized_bwt_to_suffix_array(
const uint8_t* bwt_data, uint32_t block_size,
uint32_t original_ptr, uint32_t* suffix_array
) {
// Use cache-blocking for large blocks
const size_t BLOCK_SIZE = 4096; // Cache-friendly block size
if (block_size > BLOCK_SIZE * 4) {
blocked_bwt_inversion(bwt_data, block_size, original_ptr, suffix_array);
} else {
standard_bwt_inversion(bwt_data, block_size, original_ptr, suffix_array);
}
}
private:
static void blocked_bwt_inversion(
const uint8_t* bwt_data, uint32_t block_size,
uint32_t original_ptr, uint32_t* suffix_array
) {
// Process BWT inversion in cache-friendly blocks
// This reduces memory bandwidth and improves performance
// for large blocks common in bzip2 files
const uint32_t BLOCK_SIZE = 4096; // cache-friendly block size (same value as above)
std::vector<uint32_t> block_cftab(257, 0);
std::vector<uint32_t> block_tt(block_size);
// Build frequency table in blocks
for (uint32_t start = 0; start < block_size; start += BLOCK_SIZE) {
uint32_t end = std::min(start + BLOCK_SIZE, block_size);
for (uint32_t i = start; i < end; i++) {
block_cftab[bwt_data[i]]++;
}
}
// Convert to cumulative and build transformation table
for (int32_t i = 1; i < 257; i++) {
block_cftab[i] += block_cftab[i-1];
}
for (int32_t i = block_size - 1; i >= 0; i--) {
uint8_t ch = bwt_data[i];
block_tt[--block_cftab[ch]] = i;
}
// Extract suffix array
uint32_t pos = original_ptr;
for (uint32_t i = 0; i < block_size; i++) {
suffix_array[i] = pos;
pos = block_tt[pos];
}
}
};
```
This approach is efficient for several reasons:
- Zero redundant computation: the BWT that bzip2 already computed is reused, avoiding the expensive $O(n \log n)$ suffix-sorting step.
- Streaming processing: arbitrarily large compressed files can be handled without loading everything into memory.
- Block-level parallelism: each bzip2 block can be processed independently, enabling parallel indexing.
- Memory efficiency: only the suffix arrays for the current batch of blocks need to be held, not the entire decompressed file.
- Cache optimization: the block structure of bzip2 naturally provides cache-friendly access patterns.
For a 1 GB bzip2 file, this could build the full permutation ring index in a fraction of the time needed for plaintext indexing, while using little additional memory beyond what is needed for decompression.