/*
 * deflate_compress.c - a compressor for DEFLATE
 *
 * Originally public domain; changes after 2016-09-07 are copyrighted.
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "deflate_compress.h"
#include "deflate_constants.h"
#include "unaligned.h"

#include "libdeflate.h"

/*
 * By default, the near-optimal parsing algorithm is enabled at compression
 * level 8 and above.  The near-optimal parsing algorithm produces a compression
 * ratio significantly better than that of the greedy and lazy algorithms
 * implemented here, as well as that of zlib's algorithm at level 9.  However,
 * it is slow.
 */
#define SUPPORT_NEAR_OPTIMAL_PARSING 1

/*
 * Define to 1 to maintain the full map from match offsets to offset slots.
 * This slightly speeds up translations of match offsets to offset slots, but it
 * uses 32769 bytes of memory rather than the 512 bytes used by the condensed
 * map.  The speedup provided by the larger map is most helpful when the
 * near-optimal parsing algorithm is being used.
 */
#define USE_FULL_OFFSET_SLOT_FAST	SUPPORT_NEAR_OPTIMAL_PARSING

/*
 * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters
 * appropriately.
 */
#define MATCHFINDER_WINDOW_ORDER	15

#include "hc_matchfinder.h"
#if SUPPORT_NEAR_OPTIMAL_PARSING
#  include "bt_matchfinder.h"
#endif

/*
 * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes,
 * except if the last block has to be shorter.
 */
#define MIN_BLOCK_LENGTH	10000

/*
 * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but
 * the final length might be slightly longer due to matches extending beyond
 * this limit.
 */
#define SOFT_MAX_BLOCK_LENGTH	300000

/*
 * The number of observed matches or literals that represents sufficient data to
 * decide whether the current block should be terminated or not.
 */
#define NUM_OBSERVATIONS_PER_BLOCK_CHECK       512


#if SUPPORT_NEAR_OPTIMAL_PARSING
/* Constants specific to the near-optimal parsing algorithm */

/*
 * The maximum number of matches the matchfinder can find at a single position.
 * Since the matchfinder never finds more than one match for the same length,
 * presuming one of each possible length is sufficient for an upper bound.
 * (This says nothing about whether it is worthwhile to consider so many
 * matches; this is just defining the worst case.)
 */
#  define MAX_MATCHES_PER_POS	(DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)

/*
 * The number of lz_match structures in the match cache, excluding the extra
 * "overflow" entries.  This value should be high enough so that nearly the
 * time, all matches found in a given block can fit in the match cache.
 * However, fallback behavior (immediately terminating the block) on cache
 * overflow is still required.
 */
#  define CACHE_LENGTH      (SOFT_MAX_BLOCK_LENGTH * 5)

#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

/*
 * These are the compressor-side limits on the codeword lengths for each Huffman
 * code.  To make outputting bits slightly faster, some of these limits are
 * lower than the limits defined by the DEFLATE format.  This does not
 * significantly affect the compression ratio, at least for the block lengths we
 * use.
 */
#define MAX_LITLEN_CODEWORD_LEN		14
#define MAX_OFFSET_CODEWORD_LEN		DEFLATE_MAX_OFFSET_CODEWORD_LEN
#define MAX_PRE_CODEWORD_LEN		DEFLATE_MAX_PRE_CODEWORD_LEN

/* Table: length slot => length slot base value  */
static const unsigned deflate_length_slot_base[] = {
	3   , 4   , 5   , 6   , 7   , 8   , 9   , 10  ,
	11  , 13  , 15  , 17  , 19  , 23  , 27  , 31  ,
	35  , 43  , 51  , 59  , 67  , 83  , 99  , 115 ,
	131 , 163 , 195 , 227 , 258 ,
};

/* Table: length slot => number of extra length bits  */
static const u8 deflate_extra_length_bits[] = {
	0   , 0   , 0   , 0   , 0   , 0   , 0   , 0 ,
	1   , 1   , 1   , 1   , 2   , 2   , 2   , 2 ,
	3   , 3   , 3   , 3   , 4   , 4   , 4   , 4 ,
	5   , 5   , 5   , 5   , 0   ,
};

/* Table: offset slot => offset slot base value  */
static const unsigned deflate_offset_slot_base[] = {
	1    , 2    , 3    , 4     , 5     , 7     , 9     , 13    ,
	17   , 25   , 33   , 49    , 65    , 97    , 129   , 193   ,
	257  , 385  , 513  , 769   , 1025  , 1537  , 2049  , 3073  ,
	4097 , 6145 , 8193 , 12289 , 16385 , 24577 ,
};

/* Table: offset slot => number of extra offset bits  */
static const u8 deflate_extra_offset_bits[] = {
	0    , 0    , 0    , 0     , 1     , 1     , 2     , 2     ,
	3    , 3    , 4    , 4     , 5     , 5     , 6     , 6     ,
	7    , 7    , 8    , 8     , 9     , 9     , 10    , 10    ,
	11   , 11   , 12   , 12    , 13    , 13    ,
};

/* Table: length => length slot  */
static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
	0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
	12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
	16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
	18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
	20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
	21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
	22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
	23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 28,
};

/* The order in which precode codeword lengths are stored */
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};

/* Codewords for the DEFLATE Huffman codes.  */
struct deflate_codewords {
	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
};

/* Codeword lengths (in bits) for the DEFLATE Huffman codes.
 * A zero length means the corresponding symbol had zero frequency.  */
struct deflate_lens {
	u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
	u8 offset[DEFLATE_NUM_OFFSET_SYMS];
};

/* Codewords and lengths for the DEFLATE Huffman codes.  */
struct deflate_codes {
	struct deflate_codewords codewords;
	struct deflate_lens lens;
};

/* Symbol frequency counters for the DEFLATE Huffman codes.  */
struct deflate_freqs {
	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
};

#if SUPPORT_NEAR_OPTIMAL_PARSING

/* Costs for the near-optimal parsing algorithm.  */
struct deflate_costs {

	/* The cost to output each possible literal.  */
	u32 literal[DEFLATE_NUM_LITERALS];

	/* The cost to output each possible match length.  */
	u32 length[DEFLATE_MAX_MATCH_LEN + 1];

	/* The cost to output a match offset of each possible offset slot.  */
	u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
};

/*
 * COST_SHIFT is a scaling factor that makes it possible to consider fractional
 * bit costs.  A token requiring 'n' bits to represent has cost n << COST_SHIFT.
 *
 * Note: this is only useful as a statistical trick for when the true costs are
 * unknown.  In reality, each token in DEFLATE requires a whole number of bits
 * to output.
 */
#define COST_SHIFT	3

/*
 * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
 * be needed to output a symbol that was unused in the previous optimization
 * pass.  Assigning a default cost allows the symbol to be used in the next
 * optimization pass.  However, the cost should be relatively high because the
 * symbol probably won't be used very many times (if at all).
 */
#define LITERAL_NOSTAT_BITS	13
#define LENGTH_NOSTAT_BITS	13
#define OFFSET_NOSTAT_BITS	10

#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

/*
 * Represents a run of literals followed by a match or end-of-block.  This
 * struct is needed to temporarily store items chosen by the parser, since items
 * cannot be written until all items for the block have been chosen and the
 * block's Huffman codes have been computed.
 */
struct deflate_sequence {

	/* Bits 0..22: the number of literals in this run.  This may be 0 and
	 * can be at most about SOFT_MAX_BLOCK_LENGTH.  The literals are not
	 * stored explicitly in this structure; instead, they are read directly
	 * from the uncompressed data.
	 *
	 * Bits 23..31: the length of the match which follows the literals, or 0
	 * if this literal run was the last in the block, so there is no match
	 * which follows it.  */
	u32 litrunlen_and_length;

	/* If 'length' doesn't indicate end-of-block, then this is the offset of
	 * the match which follows the literals.  */
	u16 offset;

	/* If 'length' doesn't indicate end-of-block, then this is the offset
	 * symbol of the match which follows the literals.  */
	u8 offset_symbol;

	/* If 'length' doesn't indicate end-of-block, then this is the length
	 * slot of the match which follows the literals.  */
	u8 length_slot;
};
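
/*
 * For illustration, the field layout above implies that a sequence would be
 * unpacked roughly as follows:
 *
 *	u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;	(bits 0..22)
 *	u32 length    = seq->litrunlen_and_length >> 23;	(bits 23..31)
 *
 * A 'length' of 0 marks the final literal run of the block.
 */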

#if SUPPORT_NEAR_OPTIMAL_PARSING

/*
 * This structure represents a byte position in the input data and a node in the
 * graph of possible match/literal choices for the current block.
 *
 * Logically, each incoming edge to this node is labeled with a literal or a
 * match that can be taken to reach this position from an earlier position; and
 * each outgoing edge from this node is labeled with a literal or a match that
 * can be taken to advance from this position to a later position.
 *
 * But these "edges" are actually stored elsewhere (in 'match_cache').  Here we
 * associate with each node just two pieces of information:
 *
 *	'cost_to_end' is the minimum cost to reach the end of the block from
 *	this position.
 *
 *	'item' represents the literal or match that must be chosen from here to
 *	reach the end of the block with the minimum cost.  Equivalently, this
 *	can be interpreted as the label of the outgoing edge on the minimum-cost
 *	path to the "end of block" node from this node.
 */
struct deflate_optimum_node {

	u32 cost_to_end;

	/*
	 * Notes on the match/literal representation used here:
	 *
	 *	The low bits of 'item' are the length: 1 if this is a literal,
	 *	or the match length if this is a match.
	 *
	 *	The high bits of 'item' are the actual literal byte if this is a
	 *	literal, or the match offset if this is a match.
	 */
#define OPTIMUM_OFFSET_SHIFT 9
#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
	u32 item;

};
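
/*
 * For example, under the representation above, a literal with byte value 'b'
 * would be stored as ((u32)b << OPTIMUM_OFFSET_SHIFT) | 1, and a match of
 * length 100 at offset 3000 as ((u32)3000 << OPTIMUM_OFFSET_SHIFT) | 100.
 * The length is then (item & OPTIMUM_LEN_MASK) and the offset or literal is
 * (item >> OPTIMUM_OFFSET_SHIFT).
 */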

#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

/* Block split statistics.  See "Block splitting algorithm" below. */
#define NUM_LITERAL_OBSERVATION_TYPES 8
#define NUM_MATCH_OBSERVATION_TYPES 2
#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
struct block_split_stats {
	u32 new_observations[NUM_OBSERVATION_TYPES];
	u32 observations[NUM_OBSERVATION_TYPES];
	u32 num_new_observations;
	u32 num_observations;
};

/* The main DEFLATE compressor structure  */
struct libdeflate_compressor {

	/* Pointer to the compress() implementation chosen at allocation time */
	size_t (*impl)(struct libdeflate_compressor *,
		       const u8 *, size_t, u8 *, size_t);

	/* Frequency counters for the current block  */
	struct deflate_freqs freqs;

	/* Dynamic Huffman codes for the current block  */
	struct deflate_codes codes;

	/* Static Huffman codes */
	struct deflate_codes static_codes;

	/* Block split statistics for the currently pending block */
	struct block_split_stats split_stats;

	/* A table for fast lookups of offset slot by match offset.
	 *
	 * If the full table is being used, it is a direct mapping from offset
	 * to offset slot.
	 *
	 * If the condensed table is being used, the first 256 entries map
	 * directly to the offset slots of offsets 1 through 256.  The next 256
	 * entries map to the offset slots for the remaining offsets, stepping
	 * through the offsets with a stride of 128.  This relies on the fact
	 * that each of the remaining offset slots contains at least 128 offsets
	 * and has an offset base that is a multiple of 128.  */
#if USE_FULL_OFFSET_SLOT_FAST
	u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
#else
	u8 offset_slot_fast[512];
#endif

	/* The "nice" match length: if a match of this length is found, choose
	 * it immediately without further consideration.  */
	unsigned nice_match_length;

	/* The maximum search depth: consider at most this many potential
	 * matches at each position.  */
	unsigned max_search_depth;

	/* The compression level with which this compressor was created.  */
	unsigned compression_level;

	/* Anything smaller than this we won't bother trying to compress.  */
	unsigned min_size_to_compress;

	/* Temporary space for Huffman code output  */
	u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS];
	u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
	u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS];
	unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS];
	unsigned num_litlen_syms;
	unsigned num_offset_syms;
	unsigned num_explicit_lens;
	unsigned num_precode_items;

	union {
		/* Data for greedy or lazy parsing  */
		struct {
			/* Hash chain matchfinder  */
			struct hc_matchfinder hc_mf;

			/* The matches and literals that the parser has chosen
			 * for the current block.  The required length of this
			 * array is limited by the maximum number of matches
			 * that can ever be chosen for a single block, plus one
			 * for the special entry at the end.  */
			struct deflate_sequence sequences[
				DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
					     DEFLATE_MIN_MATCH_LEN) + 1];
		} g; /* (g)reedy */

	#if SUPPORT_NEAR_OPTIMAL_PARSING
		/* Data for near-optimal parsing  */
		struct {

			/* Binary tree matchfinder  */
			struct bt_matchfinder bt_mf;

			/*
			 * Cached matches for the current block.  This array
			 * contains the matches that were found at each position
			 * in the block.  Specifically, for each position, there
			 * is a list of matches found at that position, if any,
			 * sorted by strictly increasing length.  In addition,
			 * following the matches for each position, there is a
			 * special 'struct lz_match' whose 'length' member
			 * contains the number of matches found at that
			 * position, and whose 'offset' member contains the
			 * literal at that position.
			 *
			 * Note: in rare cases, there will be a very high number
			 * of matches in the block and this array will overflow.
			 * If this happens, we force the end of the current
			 * block.  CACHE_LENGTH is the length at which we
			 * actually check for overflow.  The extra slots beyond
			 * this are enough to absorb the worst case overflow,
			 * which occurs if starting at &match_cache[CACHE_LENGTH
			 * - 1], we write MAX_MATCHES_PER_POS matches and a
			 * match count header, then skip searching for matches
			 * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write
			 * the match count header for each.
			 */
			struct lz_match match_cache[CACHE_LENGTH +
						    MAX_MATCHES_PER_POS +
						    DEFLATE_MAX_MATCH_LEN - 1];

			/*
			 * Array of nodes, one per position, for running the
			 * minimum-cost path algorithm.
			 *
			 * This array must be large enough to accommodate the
			 * worst-case number of nodes, which occurs if we find a
			 * match of length DEFLATE_MAX_MATCH_LEN at position
			 * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
			 * length SOFT_MAX_BLOCK_LENGTH - 1 +
			 * DEFLATE_MAX_MATCH_LEN.  Add one for the end-of-block
			 * node.
			 */
			struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
								  DEFLATE_MAX_MATCH_LEN + 1];

			/* The current cost model being used.  */
			struct deflate_costs costs;

			unsigned num_optim_passes;
		} n; /* (n)ear-optimal */
	#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

	} p; /* (p)arser */
};

/*
 * The type for the bitbuffer variable, which temporarily holds bits that are
 * being packed into bytes and written to the output buffer.  For best
 * performance, this should have size equal to a machine word.
 */
typedef machine_word_t bitbuf_t;
#define BITBUF_NBITS	(8 * sizeof(bitbuf_t))

/* Can the specified number of bits always be added to 'bitbuf' after any
 * pending bytes have been flushed?  */
#define CAN_BUFFER(n)	((n) <= BITBUF_NBITS - 7)
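
/*
 * For example, with a 64-bit bitbuf_t, CAN_BUFFER(n) holds for n <= 57: after
 * a flush, at most 7 bits remain pending, leaving room for 57 more.
 */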

/*
 * Structure to keep track of the current state of sending bits to the
 * compressed output buffer.
 */
struct deflate_output_bitstream {

	/* Bits that haven't yet been written to the output buffer.  */
	bitbuf_t bitbuf;

	/* Number of bits currently held in @bitbuf.  */
	unsigned bitcount;

	/* Pointer to the beginning of the output buffer.  */
	u8 *begin;

	/* Pointer to the position in the output buffer at which the next byte
	 * should be written.  */
	u8 *next;

	/* Pointer just past the end of the output buffer.  */
	u8 *end;
};

/*
 * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be
 * present following os->end, in order to not overrun the buffer when generating
 * output.  When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t)
 * bytes for put_unaligned_leword().  Otherwise we need only 1 byte.  However,
 * to make the compression algorithm produce the same result on all CPU
 * architectures (which is sometimes desirable), we have to unconditionally use
 * the maximum for any CPU, which is sizeof(bitbuf_t) == 8.
 */
#define OUTPUT_END_PADDING	8

/* Initialize the output bitstream.  'size' is assumed to be at least
 * OUTPUT_END_PADDING.  */
static void
deflate_init_output(struct deflate_output_bitstream *os,
		    void *buffer, size_t size)
{
	os->bitbuf = 0;
	os->bitcount = 0;
	os->begin = buffer;
	os->next = os->begin;
	os->end = os->begin + size - OUTPUT_END_PADDING;
}

/* Add some bits to the bitbuffer variable of the output bitstream.  The caller
 * must make sure there is enough room.  */
static forceinline void
deflate_add_bits(struct deflate_output_bitstream *os,
		 const bitbuf_t bits, const unsigned num_bits)
{
	os->bitbuf |= bits << os->bitcount;
	os->bitcount += num_bits;
}

/* Flush bits from the bitbuffer variable to the output buffer.  */
static forceinline void
deflate_flush_bits(struct deflate_output_bitstream *os)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		/* Flush a whole word (branchlessly).  */
		put_unaligned_leword(os->bitbuf, os->next);
		os->bitbuf >>= os->bitcount & ~7;
		os->next += MIN(os->end - os->next, os->bitcount >> 3);
		os->bitcount &= 7;
	} else {
		/* Flush a byte at a time.  */
		while (os->bitcount >= 8) {
			*os->next = os->bitbuf;
			if (os->next != os->end)
				os->next++;
			os->bitcount -= 8;
			os->bitbuf >>= 8;
		}
	}
}
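
/*
 * For example, on a platform with fast unaligned access and a 64-bit bitbuf,
 * a flush with bitcount == 29 stores the whole bitbuffer word, advances
 * 'next' by 29 >> 3 = 3 bytes (clamped at the end of the buffer), shifts the
 * bitbuffer right by 24, and leaves 5 bits pending.
 */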

/* Align the bitstream on a byte boundary. */
static forceinline void
deflate_align_bitstream(struct deflate_output_bitstream *os)
{
	os->bitcount += -os->bitcount & 7;
	deflate_flush_bits(os);
}

/*
 * Flush any remaining bits to the output buffer if needed.  Return the total
 * number of bytes written to the output buffer, or 0 if an overflow occurred.
 */
static size_t
deflate_flush_output(struct deflate_output_bitstream *os)
{
	if (os->next == os->end) /* overflow?  */
		return 0;

	while ((int)os->bitcount > 0) {
		*os->next++ = os->bitbuf;
		os->bitcount -= 8;
		os->bitbuf >>= 8;
	}

	return os->next - os->begin;
}

/* Given the binary tree node A[subtree_idx] whose children already
 * satisfy the maxheap property, swap the node with its greater child
 * until it is greater than both its children, so that the maxheap
 * property is satisfied in the subtree rooted at A[subtree_idx].  */
static void
heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
{
	unsigned parent_idx;
	unsigned child_idx;
	u32 v;

	v = A[subtree_idx];
	parent_idx = subtree_idx;
	while ((child_idx = parent_idx * 2) <= length) {
		if (child_idx < length && A[child_idx + 1] > A[child_idx])
			child_idx++;
		if (v >= A[child_idx])
			break;
		A[parent_idx] = A[child_idx];
		parent_idx = child_idx;
	}
	A[parent_idx] = v;
}

/* Rearrange the array 'A' so that it satisfies the maxheap property.
 * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
 */
static void
heapify_array(u32 A[], unsigned length)
{
	unsigned subtree_idx;

	for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
		heapify_subtree(A, length, subtree_idx);
}

/*
 * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
 *
 * Note: name this function heap_sort() instead of heapsort() to avoid colliding
 * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
 * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
 */
static void
heap_sort(u32 A[], unsigned length)
{
	A--; /* Use 1-based indices  */

	heapify_array(A, length);

	while (length >= 2) {
		u32 tmp = A[length];
		A[length] = A[1];
		A[1] = tmp;
		length--;
		heapify_subtree(A, length, 1);
	}
}

#define NUM_SYMBOL_BITS 10
#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)

#define GET_NUM_COUNTERS(num_syms)	((((num_syms) + 3 / 4) + 3) & ~3)
/*
 * Sort the symbols primarily by frequency and secondarily by symbol
 * value.  Discard symbols with zero frequency and fill in an array with
 * the remaining symbols, along with their frequencies.  The low
 * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
 * value, and the remaining bits will contain the frequency.
 *
 * @num_syms
 *	Number of symbols in the alphabet.
 *	Can't be greater than (1 << NUM_SYMBOL_BITS).
 *
 * @freqs[num_syms]
 *	The frequency of each symbol.
 *
 * @lens[num_syms]
 *	An array that eventually will hold the length of each codeword.
 *	This function only fills in the codeword lengths for symbols that
 *	have zero frequency, which are not well defined per se but will
 *	be set to 0.
 *
 * @symout[num_syms]
 *	The output array, described above.
 *
 * Returns the number of entries in 'symout' that were filled.  This is
 * the number of symbols that have nonzero frequency.
 */
static unsigned
sort_symbols(unsigned num_syms, const u32 freqs[restrict],
	     u8 lens[restrict], u32 symout[restrict])
{
	unsigned sym;
	unsigned i;
	unsigned num_used_syms;
	unsigned num_counters;
	unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];

	/* We rely on heapsort, but with an added optimization.  Since
	 * it's common for most symbol frequencies to be low, we first do
	 * a count sort using a limited number of counters.  High
	 * frequencies will be counted in the last counter, and only they
	 * will be sorted with heapsort.
	 *
	 * Note: with more symbols, it is generally beneficial to have more
	 * counters.  About 1 counter per 4 symbols seems fast.
	 *
	 * Note: I also tested radix sort, but even for large symbol
	 * counts (> 255) and frequencies bounded at 16 bits (enabling
	 * radix sort by just two base-256 digits), it didn't seem any
	 * faster than the method implemented here.
	 *
	 * Note: I tested the optimized quicksort implementation from
	 * glibc (with indirection overhead removed), but it was only
	 * marginally faster than the simple heapsort implemented here.
	 *
	 * Tests were done with building the codes for LZX.  Results may
	 * vary for different compression algorithms...!  */

	num_counters = GET_NUM_COUNTERS(num_syms);

	memset(counters, 0, num_counters * sizeof(counters[0]));

	/* Count the frequencies.  */
	for (sym = 0; sym < num_syms; sym++)
		counters[MIN(freqs[sym], num_counters - 1)]++;

	/* Make the counters cumulative, ignoring the zero-th, which
	 * counted symbols with zero frequency.  As a side effect, this
	 * calculates the number of symbols with nonzero frequency.  */
	num_used_syms = 0;
	for (i = 1; i < num_counters; i++) {
		unsigned count = counters[i];
		counters[i] = num_used_syms;
		num_used_syms += count;
	}

	/* Sort nonzero-frequency symbols using the counters.  At the
	 * same time, set the codeword lengths of zero-frequency symbols
	 * to 0.  */
	for (sym = 0; sym < num_syms; sym++) {
		u32 freq = freqs[sym];
		if (freq != 0) {
			symout[counters[MIN(freq, num_counters - 1)]++] =
				sym | (freq << NUM_SYMBOL_BITS);
		} else {
			lens[sym] = 0;
		}
	}

	/* Sort the symbols counted in the last counter.  */
	heap_sort(symout + counters[num_counters - 2],
		  counters[num_counters - 1] - counters[num_counters - 2]);

	return num_used_syms;
}

/*
 * Build the Huffman tree.
 *
 * This is an optimized implementation that
 *	(a) takes advantage of the frequencies being already sorted;
 *	(b) only generates non-leaf nodes, since the non-leaf nodes of a
 *	    Huffman tree are sufficient to generate a canonical code;
 *	(c) Only stores parent pointers, not child pointers;
 *	(d) Produces the nodes in the same memory used for input
 *	    frequency information.
 *
 * Array 'A', which contains 'sym_count' entries, is used for both input
 * and output.  For this function, 'sym_count' must be at least 2.
 *
 * For input, the array must contain the frequencies of the symbols,
 * sorted in increasing order.  Specifically, each entry must contain a
 * frequency left shifted by NUM_SYMBOL_BITS bits.  Any data in the low
 * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
 * Although these bits will, in fact, contain the symbols that correspond
 * to the frequencies, this function is concerned with frequencies only
 * and keeps the symbols as-is.
 *
 * For output, this function will produce the non-leaf nodes of the
 * Huffman tree.  These nodes will be stored in the first (sym_count - 1)
 * entries of the array.  Entry A[sym_count - 2] will represent the root
 * node.  Each other node will contain the zero-based index of its parent
 * node in 'A', left shifted by NUM_SYMBOL_BITS bits.  The low
 * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is.  Again,
 * note that although these low bits will, in fact, contain a symbol
 * value, this symbol will have *no relationship* with the Huffman tree
 * node that happens to occupy the same slot.  This is because this
 * implementation only generates the non-leaf nodes of the tree.
 */
static void
build_tree(u32 A[], unsigned sym_count)
{
	/* Index, in 'A', of next lowest frequency symbol that has not
	 * yet been processed.  */
	unsigned i = 0;

	/* Index, in 'A', of next lowest frequency parentless non-leaf
	 * node; or, if equal to 'e', then no such node exists yet.  */
	unsigned b = 0;

	/* Index, in 'A', of next node to allocate as a non-leaf.  */
	unsigned e = 0;

	do {
		unsigned m, n;
		u32 freq_shifted;

		/* Choose the two next lowest frequency entries.  */

		if (i != sym_count &&
		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
			m = i++;
		else
			m = b++;

		if (i != sym_count &&
		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
			n = i++;
		else
			n = b++;

		/* Allocate a non-leaf node and link the entries to it.
		 *
		 * If we link an entry that we're visiting for the first
		 * time (via index 'i'), then we're actually linking a
		 * leaf node and it will have no effect, since the leaf
		 * will be overwritten with a non-leaf when index 'e'
		 * catches up to it.  But it's not any slower to
		 * unconditionally set the parent index.
		 *
		 * We also compute the frequency of the non-leaf node as
		 * the sum of its two children's frequencies.  */

		freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);

		A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
		A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
		A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
		e++;
	} while (sym_count - e > 1);
		/* When just one entry remains, it is a "leaf" that was
		 * linked to some other node.  We ignore it, since the
		 * rest of the array contains the non-leaves which we
		 * need.  (Note that we're assuming the cases with 0 or 1
		 * symbols were handled separately.) */
}

/*
 * Given the stripped-down Huffman tree constructed by build_tree(),
 * determine the number of codewords that should be assigned each
 * possible length, taking into account the length-limited constraint.
 *
 * @A
 *	The array produced by build_tree(), containing parent index
 *	information for the non-leaf nodes of the Huffman tree.  Each
 *	entry in this array is a node; a node's parent always has a
 *	greater index than that node itself.  This function will
 *	overwrite the parent index information in this array, so
 *	essentially it will destroy the tree.  However, the data in the
 *	low NUM_SYMBOL_BITS of each entry will be preserved.
 *
 * @root_idx
 *	The 0-based index of the root node in 'A', and consequently one
 *	less than the number of tree node entries in 'A'.  (Or, really 2
 *	less than the actual length of 'A'.)
 *
 * @len_counts
 *	An array of length ('max_codeword_len' + 1) in which the number of
 *	codewords having each length <= max_codeword_len will be
 *	returned.
 *
 * @max_codeword_len
 *	The maximum permissible codeword length.
 */
static void
compute_length_counts(u32 A[restrict], unsigned root_idx,
		      unsigned len_counts[restrict], unsigned max_codeword_len)
{
	unsigned len;
	int node;

	/* The key observations are:
	 *
	 * (1) We can traverse the non-leaf nodes of the tree, always
	 * visiting a parent before its children, by simply iterating
	 * through the array in reverse order.  Consequently, we can
	 * compute the depth of each node in one pass, overwriting the
	 * parent indices with depths.
	 *
	 * (2) We can initially assume that in the real Huffman tree,
	 * both children of the root are leaves.  This corresponds to two
	 * codewords of length 1.  Then, whenever we visit a (non-leaf)
	 * node during the traversal, we modify this assumption to
	 * account for the current node *not* being a leaf, but rather
	 * its two children being leaves.  This causes the loss of one
	 * codeword for the current depth and the addition of two
	 * codewords for the current depth plus one.
	 *
	 * (3) We can handle the length-limited constraint fairly easily
	 * by simply using the largest length available when a depth
	 * exceeds max_codeword_len.
	 */

	for (len = 0; len <= max_codeword_len; len++)
		len_counts[len] = 0;
	len_counts[1] = 2;

	/* Set the root node's depth to 0.  */
	A[root_idx] &= SYMBOL_MASK;

	for (node = root_idx - 1; node >= 0; node--) {

		/* Calculate the depth of this node.  */

		unsigned parent = A[node] >> NUM_SYMBOL_BITS;
		unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
		unsigned depth = parent_depth + 1;
		unsigned len = depth;

		/* Set the depth of this node so that it is available
		 * when its children (if any) are processed.  */

		A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);

		/* If needed, decrease the length to meet the
		 * length-limited constraint.  This is not the optimal
		 * method for generating length-limited Huffman codes!
		 * But it should be good enough.  */
		if (len >= max_codeword_len) {
			len = max_codeword_len;
			do {
				len--;
			} while (len_counts[len] == 0);
		}

		/* Account for the fact that we have a non-leaf node at
		 * the current depth.  */
		len_counts[len]--;
		len_counts[len + 1] += 2;
	}
}

/*
 * Generate the codewords for a canonical Huffman code.
 *
 * @A
 *	The output array for codewords.  In addition, initially this
 *	array must contain the symbols, sorted primarily by frequency and
 *	secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
 *	each entry.
 *
 * @len
 *	Output array for codeword lengths.
 *
 * @len_counts
 *	An array that provides the number of codewords that will have
 *	each possible length <= max_codeword_len.
 *
 * @max_codeword_len
 *	Maximum length, in bits, of each codeword.
 *
 * @num_syms
 *	Number of symbols in the alphabet, including symbols with zero
 *	frequency.  This is the length of the 'A' and 'len' arrays.
 */
static void
gen_codewords(u32 A[restrict], u8 lens[restrict],
	      const unsigned len_counts[restrict],
	      unsigned max_codeword_len, unsigned num_syms)
{
	u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
	unsigned i;
	unsigned len;
	unsigned sym;

	/* Given the number of codewords that will have each length,
	 * assign codeword lengths to symbols.  We do this by assigning
	 * the lengths in decreasing order to the symbols sorted
	 * primarily by increasing frequency and secondarily by
	 * increasing symbol value.  */
	for (i = 0, len = max_codeword_len; len >= 1; len--) {
		unsigned count = len_counts[len];
		while (count--)
			lens[A[i++] & SYMBOL_MASK] = len;
	}

	/* Generate the codewords themselves.  We initialize the
	 * 'next_codewords' array to provide the lexicographically first
	 * codeword of each length, then assign codewords in symbol
	 * order.  This produces a canonical code.  */
	next_codewords[0] = 0;
	next_codewords[1] = 0;
	for (len = 2; len <= max_codeword_len; len++)
		next_codewords[len] =
			(next_codewords[len - 1] + len_counts[len - 1]) << 1;

	for (sym = 0; sym < num_syms; sym++)
		A[sym] = next_codewords[lens[sym]]++;
}
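
/*
 * As a small worked example of the canonical assignment above: with two
 * codewords of length 2 and four of length 3, 'next_codewords' becomes
 * {0, 0, 0b00, 0b100}, so the length-2 symbols receive codewords 00 and 01
 * and the length-3 symbols receive 100, 101, 110 and 111, each group in
 * order of increasing symbol value.
 */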

/*
 * ---------------------------------------------------------------------
 *			make_canonical_huffman_code()
 * ---------------------------------------------------------------------
 *
 * Given an alphabet and the frequency of each symbol in it, construct a
 * length-limited canonical Huffman code.
 *
 * @num_syms
 *	The number of symbols in the alphabet.  The symbols are the
 *	integers in the range [0, num_syms - 1].  This parameter must be
 *	at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
 *
 * @max_codeword_len
 *	The maximum permissible codeword length.
 *
 * @freqs
 *	An array of @num_syms entries, each of which specifies the
 *	frequency of the corresponding symbol.  It is valid for some,
 *	none, or all of the frequencies to be 0.
 *
 * @lens
 *	An array of @num_syms entries in which this function will return
 *	the length, in bits, of the codeword assigned to each symbol.
 *	Symbols with 0 frequency will not have codewords per se, but
 *	their entries in this array will be set to 0.  No lengths greater
 *	than @max_codeword_len will be assigned.
 *
 * @codewords
 *	An array of @num_syms entries in which this function will return
 *	the codeword for each symbol, right-justified and padded on the
 *	left with zeroes.  Codewords for symbols with 0 frequency will be
 *	undefined.
 *
 * ---------------------------------------------------------------------
 *
 * This function builds a length-limited canonical Huffman code.
 *
 * A length-limited Huffman code contains no codewords longer than some
 * specified length, and has exactly (with some algorithms) or
 * approximately (with the algorithm used here) the minimum weighted path
 * length from the root, given this constraint.
 *
 * A canonical Huffman code satisfies the properties that a longer
 * codeword never lexicographically precedes a shorter codeword, and the
 * lexicographic ordering of codewords of the same length is the same as
 * the lexicographic ordering of the corresponding symbols.  A canonical
 * Huffman code, or more generally a canonical prefix code, can be
 * reconstructed from only a list containing the codeword length of each
 * symbol.
 *
 * The classic algorithm to generate a Huffman code creates a node for
 * each symbol, then inserts these nodes into a min-heap keyed by symbol
 * frequency.  Then, repeatedly, the two lowest-frequency nodes are
 * removed from the min-heap and added as the children of a new node
 * having frequency equal to the sum of its two children, which is then
 * inserted into the min-heap.  When only a single node remains in the
 * min-heap, it is the root of the Huffman tree.  The codeword for each
 * symbol is determined by the path needed to reach the corresponding
 * node from the root.  Descending to the left child appends a 0 bit,
 * whereas descending to the right child appends a 1 bit.
 *
 * The classic algorithm is relatively easy to understand, but it is
 * subject to a number of inefficiencies.  In practice, it is fastest to
 * first sort the symbols by frequency.  (This itself can be subject to
 * an optimization based on the fact that most frequencies tend to be
 * low.)  At the same time, we sort secondarily by symbol value, which
 * aids the process of generating a canonical code.  Then, during tree
 * construction, no heap is necessary because both the leaf nodes and the
 * unparented non-leaf nodes can be easily maintained in sorted order.
 * Consequently, there can never be more than two possibilities for the
 * next-lowest-frequency node.
 *
 * In addition, because we're generating a canonical code, we actually
 * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
 * This is because for canonical code generation we don't need to know
 * where the symbols are in the tree.  Rather, we only need to know how
 * many leaf nodes have each depth (codeword length).  And this
 * information can, in fact, be quickly generated from the tree of
 * non-leaves only.
 *
 * Furthermore, we can build this stripped-down Huffman tree directly in
 * the array in which the codewords are to be generated, provided that
 * these array slots are large enough to hold a symbol and frequency
 * value.
 *
 * Still furthermore, we don't even need to maintain explicit child
 * pointers.  We only need the parent pointers, and even those can be
 * overwritten in-place with depth information as part of the process of
 * extracting codeword lengths from the tree.  So in summary, we do NOT
 * need a big structure like:
 *
 *	struct huffman_tree_node {
 *		unsigned int symbol;
 *		unsigned int frequency;
 *		unsigned int depth;
 *		struct huffman_tree_node *left_child;
 *		struct huffman_tree_node *right_child;
 *	};
 *
 *
 *   ... which often gets used in "naive" implementations of Huffman code
 *   generation.
 *
 * Many of these optimizations are based on the implementation in 7-Zip
 * (source file: C/HuffEnc.c), which has been placed in the public domain
 * by Igor Pavlov.
 */
static void
make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
			    const u32 freqs[restrict],
			    u8 lens[restrict], u32 codewords[restrict])
{
	u32 *A = codewords;
	unsigned num_used_syms;

	STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);

	/* We begin by sorting the symbols primarily by frequency and
	 * secondarily by symbol value.  As an optimization, the array
	 * used for this purpose ('A') shares storage with the space in
	 * which we will eventually return the codewords.  */

	num_used_syms = sort_symbols(num_syms, freqs, lens, A);

	/* 'num_used_syms' is the number of symbols with nonzero
	 * frequency.  This may be less than @num_syms.  'num_used_syms'
	 * is also the number of entries in 'A' that are valid.  Each
	 * entry consists of a distinct symbol and a nonzero frequency
	 * packed into a 32-bit integer.  */

	/* Handle special cases where only 0 or 1 symbols were used (had
	 * nonzero frequency).  */

	if (unlikely(num_used_syms == 0)) {
		/* Code is empty.  sort_symbols() already set all lengths
		 * to 0, so there is nothing more to do.  */
		return;
	}

	if (unlikely(num_used_syms == 1)) {
		/* Only one symbol was used, so we only need one
		 * codeword.  But two codewords are needed to form the
		 * smallest complete Huffman code, which uses codewords 0
		 * and 1.  Therefore, we choose another symbol to which
		 * to assign a codeword.  We use 0 (if the used symbol is
		 * not 0) or 1 (if the used symbol is 0).  In either
		 * case, the lesser-valued symbol must be assigned
		 * codeword 0 so that the resulting code is canonical.  */

		unsigned sym = A[0] & SYMBOL_MASK;
		unsigned nonzero_idx = sym ? sym : 1;

		codewords[0] = 0;
		lens[0] = 1;
		codewords[nonzero_idx] = 1;
		lens[nonzero_idx] = 1;
		return;
	}

	/* Build a stripped-down version of the Huffman tree, sharing the
	 * array 'A' with the symbol values.  Then extract length counts
	 * from the tree and use them to generate the final codewords.  */

	build_tree(A, num_used_syms);

	{
		unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];

		compute_length_counts(A, num_used_syms - 2,
				      len_counts, max_codeword_len);

		gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
	}
}

/*
 * Clear the Huffman symbol frequency counters.
 * This must be called when starting a new DEFLATE block.
 */
static void
deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
{
	memset(&c->freqs, 0, sizeof(c->freqs));
}

/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length.  */
static u32
deflate_reverse_codeword(u32 codeword, u8 len)
{
	/* The following branchless algorithm is faster than going bit by bit.
	 * Note: since no codewords are longer than 16 bits, we only need to
	 * reverse the low 16 bits of the 'u32'.  */
	STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);

	/* Flip adjacent 1-bit fields  */
	codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);

	/* Flip adjacent 2-bit fields  */
	codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);

	/* Flip adjacent 4-bit fields  */
	codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);

	/* Flip adjacent 8-bit fields  */
	codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);

	/* Return the high 'len' bits of the bit-reversed 16 bit value.  */
	return codeword >> (16 - len);
}
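
/*
 * For example, reversing the 4-bit codeword 0b0011: the four swap steps turn
 * 0x0003 into 0xC000, and shifting right by (16 - 4) bits leaves 0b1100.
 */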

/* Make a canonical Huffman code with bit-reversed codewords.  */
static void
deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
			  const u32 freqs[], u8 lens[], u32 codewords[])
{
	unsigned sym;

	make_canonical_huffman_code(num_syms, max_codeword_len,
				    freqs, lens, codewords);

	for (sym = 0; sym < num_syms; sym++)
		codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]);
}

/*
 * Build the literal/length and offset Huffman codes for a DEFLATE block.
 *
 * This takes as input the frequency tables for each code and produces as output
 * a set of tables that map symbols to codewords and codeword lengths.
 */
static void
deflate_make_huffman_codes(const struct deflate_freqs *freqs,
			   struct deflate_codes *codes)
{
	STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
	STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);

	deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
				  MAX_LITLEN_CODEWORD_LEN,
				  freqs->litlen,
				  codes->lens.litlen,
				  codes->codewords.litlen);

	deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
				  MAX_OFFSET_CODEWORD_LEN,
				  freqs->offset,
				  codes->lens.offset,
				  codes->codewords.offset);
}

/* Initialize c->static_codes.  */
static void
deflate_init_static_codes(struct libdeflate_compressor *c)
{
	unsigned i;
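
	/* Assign frequencies of 1 << (9 - len) so that the canonical Huffman
	 * code construction produces exactly the static DEFLATE codeword
	 * lengths: 8 bits for literals 0-143, 9 bits for 144-255, 7 bits for
	 * 256-279, 8 bits for 280-287, and 5 bits for all offset symbols.  */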

	for (i = 0; i < 144; i++)
		c->freqs.litlen[i] = 1 << (9 - 8);
	for (; i < 256; i++)
		c->freqs.litlen[i] = 1 << (9 - 9);
	for (; i < 280; i++)
		c->freqs.litlen[i] = 1 << (9 - 7);
	for (; i < 288; i++)
		c->freqs.litlen[i] = 1 << (9 - 8);

	for (i = 0; i < 32; i++)
		c->freqs.offset[i] = 1 << (5 - 5);

	deflate_make_huffman_codes(&c->freqs, &c->static_codes);
}

/* Return the offset slot for the specified match offset.  */
static forceinline unsigned
deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
{
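	/* When the full offset-to-slot table is not used, offsets <= 256 are
	 * looked up directly, while larger offsets are looked up at a
	 * granularity of 128.  This works because each offset slot for
	 * offsets > 256 begins at an offset of the form 1 + k*128 and spans a
	 * multiple of 128 offsets, so (offset - 1) >> 7 identifies the slot. */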
#if USE_FULL_OFFSET_SLOT_FAST
	return c->offset_slot_fast[offset];
#else
	if (offset <= 256)
		return c->offset_slot_fast[offset - 1];
	else
		return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
#endif
}

/* Write the header fields common to all DEFLATE block types.  */
static void
deflate_write_block_header(struct deflate_output_bitstream *os,
			   bool is_final_block, unsigned block_type)
{
	deflate_add_bits(os, is_final_block, 1);
	deflate_add_bits(os, block_type, 2);
	deflate_flush_bits(os);
}

static unsigned
deflate_compute_precode_items(const u8 lens[restrict],
			      const unsigned num_lens,
			      u32 precode_freqs[restrict],
			      unsigned precode_items[restrict])
{
	unsigned *itemptr;
	unsigned run_start;
	unsigned run_end;
	unsigned extra_bits;
	u8 len;

	memset(precode_freqs, 0,
	       DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));

	itemptr = precode_items;
	run_start = 0;
	do {
		/* Find the next run of codeword lengths.  */

		/* len = the length being repeated  */
		len = lens[run_start];

		/* Extend the run.  */
		run_end = run_start;
		do {
			run_end++;
		} while (run_end != num_lens && len == lens[run_end]);

		if (len == 0) {
			/* Run of zeroes.  */

			/* Symbol 18: RLE 11 to 138 zeroes at a time.  */
			while ((run_end - run_start) >= 11) {
				extra_bits = MIN((run_end - run_start) - 11, 0x7F);
				precode_freqs[18]++;
				*itemptr++ = 18 | (extra_bits << 5);
				run_start += 11 + extra_bits;
			}

			/* Symbol 17: RLE 3 to 10 zeroes at a time.  */
			if ((run_end - run_start) >= 3) {
				extra_bits = MIN((run_end - run_start) - 3, 0x7);
				precode_freqs[17]++;
				*itemptr++ = 17 | (extra_bits << 5);
				run_start += 3 + extra_bits;
			}
		} else {

			/* A run of nonzero lengths. */

			/* Symbol 16: RLE 3 to 6 of the previous length.  */
			if ((run_end - run_start) >= 4) {
				precode_freqs[len]++;
				*itemptr++ = len;
				run_start++;
				do {
					extra_bits = MIN((run_end - run_start) - 3, 0x3);
					precode_freqs[16]++;
					*itemptr++ = 16 | (extra_bits << 5);
					run_start += 3 + extra_bits;
				} while ((run_end - run_start) >= 3);
			}
		}

		/* Output any remaining lengths without RLE.  */
		while (run_start != run_end) {
			precode_freqs[len]++;
			*itemptr++ = len;
			run_start++;
		}
	} while (run_start != num_lens);

	return itemptr - precode_items;
}

/*
 * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
 * separate Huffman code, the "precode", which contains a symbol for each
 * possible codeword length in the larger code as well as several special
 * symbols to represent repeated codeword lengths (a form of run-length
 * encoding).  The precode is itself constructed in canonical form, and its
 * codeword lengths are represented literally in 19 3-bit fields that
 * immediately precede the compressed codeword lengths of the larger code.
 */
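
/*
 * For example (illustrative only): a run of 30 zero codeword lengths is encoded
 * as a single precode symbol 18 whose 7 extra bits store 30 - 11 = 19, while a
 * run of five codeword lengths equal to 8 is encoded as the length 8 output
 * literally, followed by symbol 16 whose 2 extra bits store 4 - 3 = 1, meaning
 * "repeat the previous length 4 times".
 */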

/* Precompute the information needed to output Huffman codes. */
static void
deflate_precompute_huffman_header(struct libdeflate_compressor *c)
{
	/* Compute how many litlen and offset symbols are needed. */

	for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
	     c->num_litlen_syms > 257;
	     c->num_litlen_syms--)
		if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0)
			break;

	for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
	     c->num_offset_syms > 1;
	     c->num_offset_syms--)
		if (c->codes.lens.offset[c->num_offset_syms - 1] != 0)
			break;

	/* If we're not using the full set of literal/length codeword lengths,
	 * then temporarily move the offset codeword lengths over so that the
	 * literal/length and offset codeword lengths are contiguous. */

	STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
		      DEFLATE_NUM_LITLEN_SYMS);

	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
		memmove((u8 *)&c->codes.lens + c->num_litlen_syms,
			(u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
			c->num_offset_syms);
	}

	/* Compute the "items" (RLE / literal tokens and extra bits) with which
	 * the codeword lengths in the larger code will be output. */
	c->num_precode_items =
		deflate_compute_precode_items((u8 *)&c->codes.lens,
					      c->num_litlen_syms +
							c->num_offset_syms,
					      c->precode_freqs,
					      c->precode_items);

	/* Build the precode. */
	STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
	deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
				  MAX_PRE_CODEWORD_LEN,
				  c->precode_freqs, c->precode_lens,
				  c->precode_codewords);

	/* Count how many precode lengths we actually need to output. */
	for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
	     c->num_explicit_lens > 4;
	     c->num_explicit_lens--)
		if (c->precode_lens[deflate_precode_lens_permutation[
						c->num_explicit_lens - 1]] != 0)
			break;

	/* Restore the offset codeword lengths if needed. */
	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
		memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
			(u8 *)&c->codes.lens + c->num_litlen_syms,
			c->num_offset_syms);
	}
}

/* Output the Huffman codes. */
static void
deflate_write_huffman_header(struct libdeflate_compressor *c,
			     struct deflate_output_bitstream *os)
{
	unsigned i;

	deflate_add_bits(os, c->num_litlen_syms - 257, 5);
	deflate_add_bits(os, c->num_offset_syms - 1, 5);
	deflate_add_bits(os, c->num_explicit_lens - 4, 4);
	deflate_flush_bits(os);

	/* Output the lengths of the codewords in the precode.  */
	for (i = 0; i < c->num_explicit_lens; i++) {
		deflate_add_bits(os, c->precode_lens[
				       deflate_precode_lens_permutation[i]], 3);
		deflate_flush_bits(os);
	}

	/* Output the encoded lengths of the codewords in the larger code.  */
	for (i = 0; i < c->num_precode_items; i++) {
		unsigned precode_item = c->precode_items[i];
		unsigned precode_sym = precode_item & 0x1F;
		deflate_add_bits(os, c->precode_codewords[precode_sym],
				 c->precode_lens[precode_sym]);
		if (precode_sym >= 16) {
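			/* Symbols 16, 17, and 18 are followed by 2, 3, and 7
			 * extra bits, respectively, giving the repeat count. */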
			if (precode_sym == 16)
				deflate_add_bits(os, precode_item >> 5, 2);
			else if (precode_sym == 17)
				deflate_add_bits(os, precode_item >> 5, 3);
			else
				deflate_add_bits(os, precode_item >> 5, 7);
		}
		STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7));
		deflate_flush_bits(os);
	}
}

static void
deflate_write_sequences(struct deflate_output_bitstream * restrict os,
			const struct deflate_codes * restrict codes,
			const struct deflate_sequence sequences[restrict],
			const u8 * restrict in_next)
{
	const struct deflate_sequence *seq = sequences;

	for (;;) {
		u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
		unsigned length = seq->litrunlen_and_length >> 23;
		unsigned length_slot;
		unsigned litlen_symbol;
		unsigned offset_symbol;

		if (litrunlen) {
		#if 1
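			/* Fast path: write four literals per iteration,
			 * flushing the bit buffer only as often as the
			 * CAN_BUFFER() checks indicate is necessary for the
			 * target's bit buffer size.  */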
			while (litrunlen >= 4) {
				unsigned lit0 = in_next[0];
				unsigned lit1 = in_next[1];
				unsigned lit2 = in_next[2];
				unsigned lit3 = in_next[3];

				deflate_add_bits(os, codes->codewords.litlen[lit0],
						 codes->lens.litlen[lit0]);
				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit1],
						 codes->lens.litlen[lit1]);
				if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit2],
						 codes->lens.litlen[lit2]);
				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit3],
						 codes->lens.litlen[lit3]);
				deflate_flush_bits(os);
				in_next += 4;
				litrunlen -= 4;
			}
			if (litrunlen-- != 0) {
				deflate_add_bits(os, codes->codewords.litlen[*in_next],
						 codes->lens.litlen[*in_next]);
				if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);
				in_next++;
				if (litrunlen-- != 0) {
					deflate_add_bits(os, codes->codewords.litlen[*in_next],
							 codes->lens.litlen[*in_next]);
					if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
						deflate_flush_bits(os);
					in_next++;
					if (litrunlen-- != 0) {
						deflate_add_bits(os, codes->codewords.litlen[*in_next],
								 codes->lens.litlen[*in_next]);
						if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
							deflate_flush_bits(os);
						in_next++;
					}
				}
				if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);
			}
		#else
			do {
				unsigned lit = *in_next++;
				deflate_add_bits(os, codes->codewords.litlen[lit],
						 codes->lens.litlen[lit]);
				deflate_flush_bits(os);
			} while (--litrunlen);
		#endif
		}

		if (length == 0)
			return;

		in_next += length;

		length_slot = seq->length_slot;
		litlen_symbol = 257 + length_slot;

		/* Litlen symbol  */
		deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
				 codes->lens.litlen[litlen_symbol]);

		/* Extra length bits  */
		STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
					 DEFLATE_MAX_EXTRA_LENGTH_BITS));
		deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
				 deflate_extra_length_bits[length_slot]);

		if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_LENGTH_BITS +
				MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			deflate_flush_bits(os);

		/* Offset symbol  */
		offset_symbol = seq->offset_symbol;
		deflate_add_bits(os, codes->codewords.offset[offset_symbol],
				 codes->lens.offset[offset_symbol]);

		if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			deflate_flush_bits(os);

		/* Extra offset bits  */
		deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol],
				 deflate_extra_offset_bits[offset_symbol]);

		deflate_flush_bits(os);

		seq++;
	}
}

#if SUPPORT_NEAR_OPTIMAL_PARSING
/*
 * Follow the minimum-cost path in the graph of possible match/literal choices
 * for the current block and write out the matches/literals using the specified
 * Huffman codes.
 *
 * Note: this duplicates some logic from deflate_write_sequences(), the reason
 * being that we don't want to waste time translating between intermediate
 * match/literal representations.
 */
static void
deflate_write_item_list(struct deflate_output_bitstream *os,
			const struct deflate_codes *codes,
			struct libdeflate_compressor *c,
			u32 block_length)
{
	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
	struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length];
	do {
		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
		unsigned litlen_symbol;
		unsigned length_slot;
		unsigned offset_slot;

		if (length == 1) {
			/* Literal  */
			litlen_symbol = offset;
			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
					 codes->lens.litlen[litlen_symbol]);
			deflate_flush_bits(os);
		} else {
			/* Match length  */
			length_slot = deflate_length_slot[length];
			litlen_symbol = 257 + length_slot;
			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
					 codes->lens.litlen[litlen_symbol]);

			deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
					 deflate_extra_length_bits[length_slot]);

			if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_LENGTH_BITS +
					MAX_OFFSET_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_OFFSET_BITS))
				deflate_flush_bits(os);


			/* Match offset  */
			offset_slot = deflate_get_offset_slot(c, offset);
			deflate_add_bits(os, codes->codewords.offset[offset_slot],
					 codes->lens.offset[offset_slot]);

			if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_OFFSET_BITS))
				deflate_flush_bits(os);

			deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot],
					 deflate_extra_offset_bits[offset_slot]);

			deflate_flush_bits(os);
		}
		cur_node += length;
	} while (cur_node != end_node);
}
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

/* Output the end-of-block symbol.  */
static void
deflate_write_end_of_block(struct deflate_output_bitstream *os,
			   const struct deflate_codes *codes)
{
	deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
			 codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
	deflate_flush_bits(os);
}

static void
deflate_write_uncompressed_block(struct deflate_output_bitstream *os,
				 const u8 *data, u16 len,
				 bool is_final_block)
{
	deflate_write_block_header(os, is_final_block,
				   DEFLATE_BLOCKTYPE_UNCOMPRESSED);
	deflate_align_bitstream(os);

	if (4 + (u32)len >= os->end - os->next) {
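		/* Not enough space remains for the LEN/NLEN fields plus the
		 * data; mark the output buffer as overflowed.  */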
		os->next = os->end;
		return;
	}

	put_unaligned_le16(len, os->next);
	os->next += 2;
	put_unaligned_le16(~len, os->next);
	os->next += 2;
	memcpy(os->next, data, len);
	os->next += len;
}

static void
deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os,
				  const u8 *data, size_t data_length,
				  bool is_final_block)
{
	do {
		u16 len = MIN(data_length, UINT16_MAX);

		deflate_write_uncompressed_block(os, data, len,
					is_final_block && len == data_length);
		data += len;
		data_length -= len;
	} while (data_length != 0);
}

/*
 * Choose the best type of block to use (dynamic Huffman, static Huffman, or
 * uncompressed), then output it.
 */
static void
deflate_flush_block(struct libdeflate_compressor * restrict c,
		    struct deflate_output_bitstream * restrict os,
		    const u8 * restrict block_begin, u32 block_length,
		    bool is_final_block, bool use_item_list)
{
	static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7,
	};

	/* Costs are measured in bits */
	u32 dynamic_cost = 0;
	u32 static_cost = 0;
	u32 uncompressed_cost = 0;
	struct deflate_codes *codes;
	int block_type;
	unsigned sym;

	/* Tally the end-of-block symbol. */
	c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;

	/* Build dynamic Huffman codes. */
	deflate_make_huffman_codes(&c->freqs, &c->codes);

	/* Account for the cost of sending dynamic Huffman codes. */
	deflate_precompute_huffman_header(c);
	dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens);
	for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
		u32 extra = deflate_extra_precode_bits[sym];
		dynamic_cost += c->precode_freqs[sym] *
				(extra + c->precode_lens[sym]);
	}

	/* Account for the cost of encoding literals. */
	for (sym = 0; sym < 256; sym++) {
		dynamic_cost += c->freqs.litlen[sym] *
				c->codes.lens.litlen[sym];
	}
	for (sym = 0; sym < 144; sym++)
		static_cost += c->freqs.litlen[sym] * 8;
	for (; sym < 256; sym++)
		static_cost += c->freqs.litlen[sym] * 9;

	/* Account for the cost of encoding the end-of-block symbol. */
	dynamic_cost += c->codes.lens.litlen[256];
	static_cost += 7;

	/* Account for the cost of encoding lengths. */
	for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) {
		u32 extra = deflate_extra_length_bits[sym - 257];
		dynamic_cost += c->freqs.litlen[sym] *
				(extra + c->codes.lens.litlen[sym]);
		static_cost += c->freqs.litlen[sym] *
				(extra + c->static_codes.lens.litlen[sym]);
	}

	/* Account for the cost of encoding offsets. */
	for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
		u32 extra = deflate_extra_offset_bits[sym];
		dynamic_cost += c->freqs.offset[sym] *
				(extra + c->codes.lens.offset[sym]);
		static_cost += c->freqs.offset[sym] * (extra + 5);
	}

	/* Compute the cost of using uncompressed blocks. */
	uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 +
			     (40 * (DIV_ROUND_UP(block_length,
						 UINT16_MAX) - 1)) +
			     (8 * block_length);

	/* Choose the cheapest block type. */
	if (dynamic_cost < MIN(static_cost, uncompressed_cost)) {
		block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN;
		codes = &c->codes;
	} else if (static_cost < uncompressed_cost) {
		block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN;
		codes = &c->static_codes;
	} else {
		block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED;
	}

	/* Now actually output the block. */

	if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
		/* Note: the length being flushed may exceed the maximum length
		 * of an uncompressed block (65535 bytes).  Therefore, more than
		 * one uncompressed block might be needed. */
		deflate_write_uncompressed_blocks(os, block_begin, block_length,
						  is_final_block);
	} else {
		/* Output the block header. */
		deflate_write_block_header(os, is_final_block, block_type);

		/* Output the Huffman codes (dynamic Huffman blocks only). */
		if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN)
			deflate_write_huffman_header(c, os);

		/* Output the literals, matches, and end-of-block symbol. */
	#if SUPPORT_NEAR_OPTIMAL_PARSING
		if (use_item_list)
			deflate_write_item_list(os, codes, c, block_length);
		else
	#endif
			deflate_write_sequences(os, codes, c->p.g.sequences,
						block_begin);
		deflate_write_end_of_block(os, codes);
	}
}

static forceinline void
deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
		       u32 *litrunlen_p)
{
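	/* Tally the literal's frequency and extend the pending literal run. */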
	c->freqs.litlen[literal]++;
	++*litrunlen_p;
}

static forceinline void
deflate_choose_match(struct libdeflate_compressor *c,
		     unsigned length, unsigned offset,
		     u32 *litrunlen_p, struct deflate_sequence **next_seq_p)
{
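	/* Tally the match's litlen and offset symbol frequencies, then record
	 * a sequence entry that ends the pending literal run.  */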
	struct deflate_sequence *seq = *next_seq_p;
	unsigned length_slot = deflate_length_slot[length];
	unsigned offset_slot = deflate_get_offset_slot(c, offset);

	c->freqs.litlen[257 + length_slot]++;
	c->freqs.offset[offset_slot]++;

	seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p;
	seq->offset = offset;
	seq->length_slot = length_slot;
	seq->offset_symbol = offset_slot;

	*litrunlen_p = 0;
	*next_seq_p = seq + 1;
}

static forceinline void
deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen)
{
	seq->litrunlen_and_length = litrunlen; /* length = 0 */
}

/******************************************************************************/

/*
 * Block splitting algorithm.  The problem is to decide when it is worthwhile to
 * start a new block with new Huffman codes.  There is a theoretically optimal
 * solution: recursively consider every possible block split, evaluate the exact
 * cost of each block, and choose the minimum-cost split.  But this is far too
 * slow.  Instead, as an approximation, we count symbols and, after every N
 * symbols, compare the distribution of symbols expected from the previous data
 * with the actual distribution.  If they differ "by enough", then start a new
 * block.
 *
 * As an optimization and heuristic, we don't distinguish between every symbol
 * but rather we combine many symbols into a single "observation type".  For
 * literals we only look at the high bits and low bits, and for matches we only
 * look at whether the match is long or not.  The assumption is that for typical
 * "real" data, places that are good block boundaries will tend to be noticeable
 * based only on changes in these aggregate frequencies, without looking for
 * subtle differences in individual symbols.  For example, a change from ASCII
 * bytes to non-ASCII bytes, or from few matches (generally less compressible)
 * to many matches (generally more compressible), would be easily noticed based
 * on the aggregates.
 *
 * For determining whether the frequency distributions are "different enough" to
 * start a new block, the simple heuristic of splitting when the sum of absolute
 * differences exceeds a constant seems to be good enough.  We also add a number
 * proportional to the block length so that the algorithm is more likely to end
 * long blocks than short blocks.  This reflects the general expectation that it
 * will become increasingly beneficial to start a new block as the current
 * block grows longer.
 *
 * Finally, since this is only an approximation, it is not strictly necessary to
 * consider the exact symbols that will be used.  With "near-optimal parsing",
 * for example, the actual symbols that will be used are unknown until after the
 * block boundary has been chosen and the block has been optimized.  Since the
 * final choices cannot be known in advance, we use preliminary "greedy" choices
 * instead.
 */

/* Initialize the block split statistics when starting a new block. */
static void
init_block_split_stats(struct block_split_stats *stats)
{
	int i;

	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->new_observations[i] = 0;
		stats->observations[i] = 0;
	}
	stats->num_new_observations = 0;
	stats->num_observations = 0;
}

/* Literal observation.  Heuristic: use the top 2 bits and the low bit of the
 * literal, for 8 possible literal observation types.  */
static forceinline void
observe_literal(struct block_split_stats *stats, u8 lit)
{
	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
	stats->num_new_observations++;
}

/* Match observation.  Heuristic: use one observation type for "short match" and
 * one observation type for "long match".  */
static forceinline void
observe_match(struct block_split_stats *stats, unsigned length)
{
	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
	stats->num_new_observations++;
}

static bool
do_end_block_check(struct block_split_stats *stats, u32 block_length)
{
	int i;

	if (stats->num_observations > 0) {

		/* Note: to avoid slow divisions, we do not divide by
		 * 'num_observations', but rather do all math with the numbers
		 * multiplied by 'num_observations'.  */
		u32 total_delta = 0;
		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
			u32 expected = stats->observations[i] * stats->num_new_observations;
			u32 actual = stats->new_observations[i] * stats->num_observations;
			u32 delta = (actual > expected) ? actual - expected :
							  expected - actual;
			total_delta += delta;
		}

		/* Ready to end the block? */
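		/* End the block once 'total_delta', plus a bonus of
		 * num_observations for every 4096 bytes of block length,
		 * reaches about 39% (200/512) of
		 * NUM_OBSERVATIONS_PER_BLOCK_CHECK * num_observations.  The
		 * bonus makes long blocks more likely to be ended than short
		 * ones.  */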
		if (total_delta + (block_length / 4096) * stats->num_observations >=
		    NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
			return true;
	}

	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->num_observations += stats->new_observations[i];
		stats->observations[i] += stats->new_observations[i];
		stats->new_observations[i] = 0;
	}
	stats->num_new_observations = 0;
	return false;
}

static forceinline bool
should_end_block(struct block_split_stats *stats,
		 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
{
	/* Ready to check block split statistics? */
	if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
	    in_next - in_block_begin < MIN_BLOCK_LENGTH ||
	    in_end - in_next < MIN_BLOCK_LENGTH)
		return false;

	return do_end_block_check(stats, in_next - in_block_begin);
}

/******************************************************************************/

/*
 * This is the level 0 "compressor".  It always outputs uncompressed blocks.
 */
static size_t
deflate_compress_none(struct libdeflate_compressor * restrict c,
		      const u8 * restrict in, size_t in_nbytes,
		      u8 * restrict out, size_t out_nbytes_avail)
{
	struct deflate_output_bitstream os;

	deflate_init_output(&os, out, out_nbytes_avail);

	deflate_write_uncompressed_blocks(&os, in, in_nbytes, true);

	return deflate_flush_output(&os);
}

/*
 * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
 */
static size_t
deflate_compress_greedy(struct libdeflate_compressor * restrict c,
			const u8 * restrict in, size_t in_nbytes,
			u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	hc_matchfinder_init(&c->p.g.hc_mf);

	do {
		/* Starting a new DEFLATE block.  */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		u32 litrunlen = 0;
		struct deflate_sequence *next_seq = c->p.g.sequences;

		init_block_split_stats(&c->split_stats);
		deflate_reset_symbol_frequencies(c);

		do {
			u32 length;
			u32 offset;

			/* Decrease the maximum and nice match lengths if we're
			 * approaching the end of the input buffer.  */
			if (unlikely(max_len > in_end - in_next)) {
				max_len = in_end - in_next;
				nice_len = MIN(nice_len, max_len);
			}

			length = hc_matchfinder_longest_match(&c->p.g.hc_mf,
							      &in_cur_base,
							      in_next,
							      DEFLATE_MIN_MATCH_LEN - 1,
							      max_len,
							      nice_len,
							      c->max_search_depth,
							      next_hashes,
							      &offset);

			if (length >= DEFLATE_MIN_MATCH_LEN) {
				/* Match found.  */
				deflate_choose_match(c, length, offset,
						     &litrunlen, &next_seq);
				observe_match(&c->split_stats, length);
				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
									&in_cur_base,
									in_next + 1,
									in_end,
									length - 1,
									next_hashes);
			} else {
				/* No match found.  */
				deflate_choose_literal(c, *in_next, &litrunlen);
				observe_literal(&c->split_stats, *in_next);
				in_next++;
			}

			/* Check if it's time to output another block.  */
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));

		deflate_finish_sequence(next_seq, litrunlen);
		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    in_next == in_end, false);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}

/*
 * This is the "lazy" DEFLATE compressor.  Before choosing a match, it checks to
 * see if there's a longer match at the next position.  If yes, it outputs a
 * literal and continues to the next position.  If no, it outputs the match.
 */
static size_t
deflate_compress_lazy(struct libdeflate_compressor * restrict c,
		      const u8 * restrict in, size_t in_nbytes,
		      u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	hc_matchfinder_init(&c->p.g.hc_mf);

	do {
		/* Starting a new DEFLATE block.  */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		u32 litrunlen = 0;
		struct deflate_sequence *next_seq = c->p.g.sequences;

		init_block_split_stats(&c->split_stats);
		deflate_reset_symbol_frequencies(c);

		do {
			unsigned cur_len;
			unsigned cur_offset;
			unsigned next_len;
			unsigned next_offset;

			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
				max_len = in_end - in_next;
				nice_len = MIN(nice_len, max_len);
			}

			/* Find the longest match at the current position.  */
			cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
							       &in_cur_base,
							       in_next,
							       DEFLATE_MIN_MATCH_LEN - 1,
							       max_len,
							       nice_len,
							       c->max_search_depth,
							       next_hashes,
							       &cur_offset);
			in_next += 1;

			if (cur_len < DEFLATE_MIN_MATCH_LEN) {
				/* No match found.  Choose a literal.  */
				deflate_choose_literal(c, *(in_next - 1), &litrunlen);
				observe_literal(&c->split_stats, *(in_next - 1));
				continue;
			}

		have_cur_match:
			observe_match(&c->split_stats, cur_len);

			/* We have a match at the current position.  */

			/* If the current match is very long, choose it
			 * immediately.  */
			if (cur_len >= nice_len) {
				deflate_choose_match(c, cur_len, cur_offset,
						     &litrunlen, &next_seq);
				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
									&in_cur_base,
									in_next,
									in_end,
									cur_len - 1,
									next_hashes);
				continue;
			}

			/*
			 * Try to find a match at the next position.
			 *
			 * Note: since we already have a match at the *current*
			 * position, we use only half the 'max_search_depth'
			 * when checking the *next* position.  This is a useful
			 * trade-off because it's more worthwhile to use a
			 * greater search depth on the initial match.
			 *
			 * Note: it's possible to structure the code such that
			 * there's only one call to longest_match(), which
			 * handles both the "find the initial match" and "try to
			 * find a longer match" cases.  However, it is faster to
			 * have two call sites, with longest_match() inlined at
			 * each.
			 */
			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
				max_len = in_end - in_next;
				nice_len = MIN(nice_len, max_len);
			}
			next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
								&in_cur_base,
								in_next,
								cur_len,
								max_len,
								nice_len,
								c->max_search_depth / 2,
								next_hashes,
								&next_offset);
			in_next += 1;

			if (next_len > cur_len) {
				/* Found a longer match at the next position.
				 * Output a literal.  Then the next match
				 * becomes the current match.  */
				deflate_choose_literal(c, *(in_next - 2), &litrunlen);
				cur_len = next_len;
				cur_offset = next_offset;
				goto have_cur_match;
			}

			/* No longer match at the next position.
			 * Output the current match.  */
			deflate_choose_match(c, cur_len, cur_offset,
					     &litrunlen, &next_seq);
			in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
								&in_cur_base,
								in_next,
								in_end,
								cur_len - 2,
								next_hashes);

			/* Check if it's time to output another block.  */
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));

		deflate_finish_sequence(next_seq, litrunlen);
		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    in_next == in_end, false);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}

#if SUPPORT_NEAR_OPTIMAL_PARSING

/*
 * Follow the minimum-cost path in the graph of possible match/literal choices
 * for the current block and compute the frequencies of the Huffman symbols that
 * would be needed to output those matches and literals.
 */
static void
deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
{
	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
	do {
		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;

		if (length == 1) {
			/* Literal  */
			c->freqs.litlen[offset]++;
		} else {
			/* Match  */
			c->freqs.litlen[257 + deflate_length_slot[length]]++;
			c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
		}
		cur_node += length;
	} while (cur_node != end_node);
}

/* Set the current cost model from the codeword lengths specified in @lens.  */
static void
deflate_set_costs_from_codes(struct libdeflate_compressor *c,
			     const struct deflate_lens *lens)
{
	unsigned i;

	/* Literals  */
	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
		u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS);
		c->p.n.costs.literal[i] = bits << COST_SHIFT;
	}

	/* Lengths  */
	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
		unsigned length_slot = deflate_length_slot[i];
		unsigned litlen_sym = 257 + length_slot;
		u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
		bits += deflate_extra_length_bits[length_slot];
		c->p.n.costs.length[i] = bits << COST_SHIFT;
	}

	/* Offset slots  */
	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
		u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS);
		bits += deflate_extra_offset_bits[i];
		c->p.n.costs.offset_slot[i] = bits << COST_SHIFT;
	}
}

static forceinline u32
deflate_default_literal_cost(unsigned literal)
{
	STATIC_ASSERT(COST_SHIFT == 3);
	/* 66 is 8.25 bits/symbol  */
	return 66;
}

static forceinline u32
deflate_default_length_slot_cost(unsigned length_slot)
{
	STATIC_ASSERT(COST_SHIFT == 3);
	/* 60 is 7.5 bits/symbol  */
	return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT);
}

static forceinline u32
deflate_default_offset_slot_cost(unsigned offset_slot)
{
	STATIC_ASSERT(COST_SHIFT == 3);
	/* 39 is 4.875 bits/symbol  */
	return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT);
}

/*
 * Set default symbol costs for the first block's first optimization pass.
 *
 * It works well to assume that each symbol is equally probable.  This results
 * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 <<
 * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding
 * alphabet.  However, we intentionally bias the parse towards matches rather
 * than literals by using a slightly lower default cost for length symbols than
 * for literals.  This often improves the compression ratio slightly.
 */
static void
deflate_set_default_costs(struct libdeflate_compressor *c)
{
	unsigned i;

	/* Literals  */
	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
		c->p.n.costs.literal[i] = deflate_default_literal_cost(i);

	/* Lengths  */
	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
		c->p.n.costs.length[i] = deflate_default_length_slot_cost(
						deflate_length_slot[i]);

	/* Offset slots  */
	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
		c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i);
}

static forceinline void
deflate_adjust_cost(u32 *cost_p, u32 default_cost)
{
	*cost_p += ((s32)default_cost - (s32)*cost_p) >> 1;
}

/*
 * Adjust the costs when beginning a new block.
 *
 * Since the current costs have been optimized for the data, it's undesirable to
 * throw them away and start over with the default costs.  At the same time, we
 * don't want to bias the parse by assuming that the next block will be similar
 * to the current block.  As a compromise, make the costs closer to the
 * defaults, but don't simply set them to the defaults.
 */
static void
deflate_adjust_costs(struct libdeflate_compressor *c)
{
	unsigned i;

	/* Literals  */
	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
		deflate_adjust_cost(&c->p.n.costs.literal[i],
				    deflate_default_literal_cost(i));

	/* Lengths  */
	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
		deflate_adjust_cost(&c->p.n.costs.length[i],
				    deflate_default_length_slot_cost(
						deflate_length_slot[i]));

	/* Offset slots  */
	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
		deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
				    deflate_default_offset_slot_cost(i));
}

/*
 * Find the minimum-cost path through the graph of possible match/literal
 * choices for this block.
 *
 * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
 * represents the node at the beginning of the block, to
 * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
 * the block.  Edge costs are evaluated using the cost model 'c->p.n.costs'.
 *
 * The algorithm works backwards, starting at the end node and proceeding
 * backwards one node at a time.  At each node, the minimum cost to reach the
 * end node is computed and the match/literal choice that begins that path is
 * saved.
 */
static void
deflate_find_min_cost_path(struct libdeflate_compressor *c,
			   const u32 block_length,
			   const struct lz_match *cache_ptr)
{
	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
	struct deflate_optimum_node *cur_node = end_node;

	cur_node->cost_to_end = 0;
	do {
		unsigned num_matches;
		unsigned literal;
		u32 best_cost_to_end;

		cur_node--;
		cache_ptr--;

		num_matches = cache_ptr->length;
		literal = cache_ptr->offset;

		/* It's always possible to choose a literal.  */
		best_cost_to_end = c->p.n.costs.literal[literal] +
				   (cur_node + 1)->cost_to_end;
		cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;

		/* Also consider matches if there are any.  */
		if (num_matches) {
			const struct lz_match *match;
			unsigned len;
			unsigned offset;
			unsigned offset_slot;
			u32 offset_cost;
			u32 cost_to_end;

			/*
			 * Consider each length from the minimum
			 * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
			 * match found at this position.  For each length, we
			 * consider only the smallest offset for which that
			 * length is available.  Although this is not guaranteed
			 * to be optimal due to the possibility of a larger
			 * offset costing less than a smaller offset to code,
			 * this is a very useful heuristic.
			 */
			match = cache_ptr - num_matches;
			len = DEFLATE_MIN_MATCH_LEN;
			do {
				offset = match->offset;
				offset_slot = deflate_get_offset_slot(c, offset);
				offset_cost = c->p.n.costs.offset_slot[offset_slot];
				do {
					cost_to_end = offset_cost +
						      c->p.n.costs.length[len] +
						      (cur_node + len)->cost_to_end;
					if (cost_to_end < best_cost_to_end) {
						best_cost_to_end = cost_to_end;
						cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
					}
				} while (++len <= match->length);
			} while (++match != cache_ptr);
			cache_ptr -= num_matches;
		}
		cur_node->cost_to_end = best_cost_to_end;
	} while (cur_node != &c->p.n.optimum_nodes[0]);
}

/*
 * Choose the literal/match sequence to use for the current block.  The basic
 * algorithm finds a minimum-cost path through the block's graph of
 * literal/match choices, given a cost model.  However, the cost of each symbol
 * is unknown until the Huffman codes have been built, but at the same time the
 * Huffman codes depend on the frequencies of chosen symbols.  Consequently,
 * multiple passes must be used to try to approximate an optimal solution.  The
 * first pass uses default costs, mixed with the costs from the previous block
 * if any.  Later passes use the Huffman codeword lengths from the previous pass
 * as the costs.
 */
static void
deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
		       const struct lz_match *cache_ptr, bool is_first_block)
{
	unsigned num_passes_remaining = c->p.n.num_optim_passes;
	u32 i;

	/* Force the block to really end at the desired length, even if some
	 * matches extend beyond it. */
	for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
					ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
		c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
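	/* (0x80000000 serves as an effectively infinite cost, so the
	 * minimum-cost path is never routed through nodes past the block
	 * end.)  */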

	/* Set the initial costs. */
	if (is_first_block)
		deflate_set_default_costs(c);
	else
		deflate_adjust_costs(c);

	for (;;) {
		/* Find the minimum cost path for this pass. */
		deflate_find_min_cost_path(c, block_length, cache_ptr);

		/* Compute frequencies of the chosen symbols. */
		deflate_reset_symbol_frequencies(c);
		deflate_tally_item_list(c, block_length);

		if (--num_passes_remaining == 0)
			break;

		/* At least one optimization pass remains; update the costs. */
		deflate_make_huffman_codes(&c->freqs, &c->codes);
		deflate_set_costs_from_codes(c, &c->codes.lens);
	}
}

/*
 * This is the "near-optimal" DEFLATE compressor.  It computes the optimal
 * representation of each DEFLATE block using a minimum-cost path search over
 * the graph of possible match/literal choices for that block, assuming a
 * certain cost for each Huffman symbol.
 *
 * For several reasons, the end result is not guaranteed to be optimal:
 *
 * - Nonoptimal choice of blocks
 * - Heuristic limitations on which matches are actually considered
 * - Symbol costs are unknown until the symbols have already been chosen
 *   (so iterative optimization must be used)
 */
static size_t
deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
			      const u8 * restrict in, size_t in_nbytes,
			      u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	bt_matchfinder_init(&c->p.n.bt_mf);

	do {
		/* Starting a new DEFLATE block.  */

		struct lz_match *cache_ptr = c->p.n.match_cache;
		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		const u8 *next_observation = in_next;

		init_block_split_stats(&c->split_stats);

		/*
		 * Find matches until we decide to end the block.  We end the
		 * block if any of the following is true:
		 *
		 * (1) Maximum block length has been reached
		 * (2) The match cache may overflow.
		 * (3) Block split heuristic says to split now.
		 */
		do {
			struct lz_match *matches;
			unsigned best_len;

			/* Slide the window forward if needed.  */
			if (in_next == in_next_slide) {
				bt_matchfinder_slide_window(&c->p.n.bt_mf);
				in_cur_base = in_next;
				in_next_slide = in_next + MIN(in_end - in_next,
							      MATCHFINDER_WINDOW_SIZE);
			}

			/* Decrease the maximum and nice match lengths if we're
			 * approaching the end of the input buffer.  */
			if (unlikely(max_len > in_end - in_next)) {
				max_len = in_end - in_next;
				nice_len = MIN(nice_len, max_len);
			}

			/*
			 * Find matches with the current position using the
			 * binary tree matchfinder and save them in
			 * 'match_cache'.
			 *
			 * Note: the binary tree matchfinder is more suited for
			 * optimal parsing than the hash chain matchfinder.  The
			 * reasons for this include:
			 *
			 * - The binary tree matchfinder can find more matches
			 *   in the same number of steps.
			 * - One of the major advantages of hash chains is that
			 *   skipping positions (not searching for matches at
			 *   them) is faster; however, with optimal parsing we
			 *   search for matches at almost all positions, so this
			 *   advantage of hash chains is negated.
			 */
			matches = cache_ptr;
			best_len = 0;
			if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
				cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf,
								       in_cur_base,
								       in_next - in_cur_base,
								       max_len,
								       nice_len,
								       c->max_search_depth,
								       next_hashes,
								       &best_len,
								       matches);
			}

			if (in_next >= next_observation) {
				if (best_len >= 4) {
					observe_match(&c->split_stats, best_len);
					next_observation = in_next + best_len;
				} else {
					observe_literal(&c->split_stats, *in_next);
					next_observation = in_next + 1;
				}
			}
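
			/* Store the trailer entry for this position: its
			 * 'length' field holds the number of matches cached
			 * here, and its 'offset' field holds the literal byte
			 * in case the optimal parse chooses a literal at this
			 * position.  */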

			cache_ptr->length = cache_ptr - matches;
			cache_ptr->offset = *in_next;
			in_next++;
			cache_ptr++;

			/*
			 * If there was a very long match found, don't cache any
			 * matches for the bytes covered by that match.  This
			 * avoids degenerate behavior when compressing highly
			 * redundant data, where the number of matches can be
			 * very large.
			 *
			 * This heuristic doesn't actually hurt the compression
			 * ratio very much.  If there's a long match, then the
			 * data must be highly compressible, so it doesn't
			 * matter much what we do.
			 */
			if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) {
				--best_len;
				do {
					if (in_next == in_next_slide) {
						bt_matchfinder_slide_window(&c->p.n.bt_mf);
						in_cur_base = in_next;
						in_next_slide = in_next + MIN(in_end - in_next,
									      MATCHFINDER_WINDOW_SIZE);
					}
					if (unlikely(max_len > in_end - in_next)) {
						max_len = in_end - in_next;
						nice_len = MIN(nice_len, max_len);
					}
					if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) {
						bt_matchfinder_skip_position(&c->p.n.bt_mf,
									     in_cur_base,
									     in_next - in_cur_base,
									     nice_len,
									     c->max_search_depth,
									     next_hashes);
					}
					cache_ptr->length = 0;
					cache_ptr->offset = *in_next;
					in_next++;
					cache_ptr++;
				} while (--best_len);
			}
		} while (in_next < in_max_block_end &&
			 cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] &&
			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));

		/* All the matches for this block have been cached.  Now choose
		 * the sequence of items to output and flush the block.  */
		deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
				       in_block_begin == in);
		deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin,
				    in_next == in_end, true);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}

#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

/* Initialize c->offset_slot_fast.  */
static void
deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
{
	unsigned offset_slot;
	unsigned offset;
	unsigned offset_end;

	for (offset_slot = 0;
	     offset_slot < ARRAY_LEN(deflate_offset_slot_base);
	     offset_slot++)
	{
		offset = deflate_offset_slot_base[offset_slot];
	#if USE_FULL_OFFSET_SLOT_FAST
		offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
		do {
			c->offset_slot_fast[offset] = offset_slot;
		} while (++offset != offset_end);
	#else
		if (offset <= 256) {
			offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
			do {
				c->offset_slot_fast[offset - 1] = offset_slot;
			} while (++offset != offset_end);
		} else {
			offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
			do {
				c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
			} while ((offset += (1 << 7)) != offset_end);
		}
	#endif
	}
}

LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
libdeflate_alloc_compressor(int compression_level)
{
	struct libdeflate_compressor *c;
	size_t size = offsetof(struct libdeflate_compressor, p);

	if (compression_level < 0 || compression_level > 12)
		return NULL;
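
	/* Only allocate the greedy/lazy ('p.g') or near-optimal ('p.n')
	 * parsing state that the requested compression level actually
	 * needs.  */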

#if SUPPORT_NEAR_OPTIMAL_PARSING
	if (compression_level >= 8)
		size += sizeof(c->p.n);
	else if (compression_level >= 1)
		size += sizeof(c->p.g);
#else
	if (compression_level >= 1)
		size += sizeof(c->p.g);
#endif

	c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size);
	if (!c)
		return NULL;

	c->compression_level = compression_level;

	/*
	 * The higher the compression level, the more effort it is worth
	 * spending to try to compress even very small inputs.
	 */
	c->min_size_to_compress = 56 - (compression_level * 4);

	switch (compression_level) {
	case 0:
		c->impl = deflate_compress_none;
		break;
	case 1:
		c->impl = deflate_compress_greedy;
		c->max_search_depth = 2;
		c->nice_match_length = 8;
		break;
	case 2:
		c->impl = deflate_compress_greedy;
		c->max_search_depth = 6;
		c->nice_match_length = 10;
		break;
	case 3:
		c->impl = deflate_compress_greedy;
		c->max_search_depth = 12;
		c->nice_match_length = 14;
		break;
	case 4:
		c->impl = deflate_compress_greedy;
		c->max_search_depth = 24;
		c->nice_match_length = 24;
		break;
	case 5:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 20;
		c->nice_match_length = 30;
		break;
	case 6:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 40;
		c->nice_match_length = 65;
		break;
	case 7:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 100;
		c->nice_match_length = 130;
		break;
#if SUPPORT_NEAR_OPTIMAL_PARSING
	case 8:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 12;
		c->nice_match_length = 20;
		c->p.n.num_optim_passes = 1;
		break;
	case 9:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 16;
		c->nice_match_length = 26;
		c->p.n.num_optim_passes = 2;
		break;
	case 10:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 30;
		c->nice_match_length = 50;
		c->p.n.num_optim_passes = 2;
		break;
	case 11:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 60;
		c->nice_match_length = 80;
		c->p.n.num_optim_passes = 3;
		break;
	default:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 100;
		c->nice_match_length = 133;
		c->p.n.num_optim_passes = 4;
		break;
#else
	case 8:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 150;
		c->nice_match_length = 200;
		break;
	default:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 200;
		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
		break;
#endif
	}

	deflate_init_offset_slot_fast(c);
	deflate_init_static_codes(c);

	return c;
}

LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_deflate_compress(struct libdeflate_compressor *c,
			    const void *in, size_t in_nbytes,
			    void *out, size_t out_nbytes_avail)
{
	if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING))
		return 0;

	/* For extremely small inputs just use a single uncompressed block. */
	if (unlikely(in_nbytes < c->min_size_to_compress)) {
		struct deflate_output_bitstream os;
		deflate_init_output(&os, out, out_nbytes_avail);
		if (in_nbytes == 0)
			in = &os; /* Avoid passing NULL to memcpy() */
		deflate_write_uncompressed_block(&os, in, in_nbytes, true);
		return deflate_flush_output(&os);
	}

	return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
}

LIBDEFLATEEXPORT void LIBDEFLATEAPI
libdeflate_free_compressor(struct libdeflate_compressor *c)
{
	libdeflate_aligned_free(c);
}

unsigned int
deflate_get_compression_level(struct libdeflate_compressor *c)
{
	return c->compression_level;
}

LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
				  size_t in_nbytes)
{
	/*
	 * The worst case is all uncompressed blocks where one block has length
	 * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH.
	 * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
	 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
	 */
	size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
	return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING;
}