PROBLEM LINK:
Practice
Contest: Division 1
Contest: Division 2
Contest: Division 3
Contest: Division 4
Author: iceknight1093
Tester: raysh07
Editorialist: iceknight1093
DIFFICULTY:
Medium
PREREQUISITES:
Segment trees
PROBLEM:
You’re given an array A. You can partition it into some M \ge 1 non-empty subarrays, then add 0, 1, 2, \ldots, M-1 to the elements of each of these subarrays in some order (all elements in the same subarray receive the same added value), and then reduce all elements modulo K.
Find the lexicographically smallest final array.
Here, N \le 2\cdot 10^5.
EXPLANATION:
First, read the solution to the easy version.
We continue from there.
The slow part of that solution is the fact that we had to iterate through every possible final value of A_i, and run a check for each one, to decide which one is the best choice.
To solve this higher-constraints version, that’s really the only part that needs to be sped up.
When minimizing A_i, there’s a clear order of optimality.
If we let z = (K - A_i) \bmod K, then it’s definitely optimal to use z if we’re able to (in order to reach 0), otherwise use z+1, z+2, \ldots, K-1 after which it cycles around to being 0, 1, 2, \ldots, z-1.
So, our goal is really to find the first value \ge z (in cyclic order) that is allowed to be used here.
This is essentially a couple of range queries!
First, we will change perspectives a bit.
Suppose we’re at index i. There are N-i elements remaining, each of which can be used for one value \le P.
So, we need to ensure that there are at most N-i unused elements \le P, so far (after fixing A_i).
In particular, if we store a list of all unused elements so far (as in, elements that have not appeared as K\cdot c + x for some c\le hi[x]), then note that the maximum allowed value we can use is the (N-i+1)-th unused element! Anything larger is not possible.
To quickly query for this, we can build a segment tree (or fenwick tree) over values, and for each value maintain whether it’s used or not.
The updates take \mathcal{O}(\log N) time and occur N times, while queries for k-th missing element take \mathcal{O}(\log N) or \mathcal{O}(\log^2 N) depending on implementation.
Now, for each remainder r, let’s store a value mn[r] to be the minimum possible valid value of (maximum forced label) if we are to add r to A_i.
That is, the minimum possible value of the value P.
When we’re at index i, we know from the missing-element query what is the maximum allowed value of P.
So, we can simply search to find the first value \ge z that satisfies this condition!
…Well, not quite - there is in fact some detail here.
In particular, the limit doesn’t depend purely on i.
Specifically, if r + K\cdot hi[r] \le P then this element that was previously unused becomes used; so the the actual limit is the next largest unused element (not the (N-i+1)-th that we found, but the (N-i+2)-th instead.)
So, we can instead build a min-segment tree that stores mn[r] for all r, as well as the values of r + K\cdot hi[r].
You can then use these two values to walk down the segment tree and find the optimal r - the details can be found in the code below.
After finding the optimal r, it’s easy enough to keep all data structures updated.
Note that you might have to also explicitly run a check for whether extending the previous block is optimal or not; which is not too hard after everything above.
TIME COMPLEXITY:
\mathcal{O}(N \log N) per testcase.
CODE:
Tester's code (C++)
#include <bits/stdc++.h>
using namespace std;
#define int long long
#define INF (int)1e18
mt19937_64 RNG(chrono::steady_clock::now().time_since_epoch().count());
struct segtree{
struct node{
int v0 = INF;
int v1 = INF;
void apply(int l, int r, int a0, int a1){
v0 = a0;
v1 = a1;
}
};
int n;
vector <node> seg;
node unite(node a, node b){
node res;
res.v0 = min(a.v0, b.v0);
res.v1 = min(a.v1, b.v1);
return res;
}
void push(int l, int r, int pos){
}
void pull(int pos){
seg[pos] = unite(seg[pos * 2], seg[pos * 2 + 1]);
}
void build(int l, int r, int pos){
if (l == r){
return;
}
int mid = (l + r) / 2;
build(l, mid, pos * 2);
build(mid + 1, r, pos * 2 + 1);
pull(pos);
}
template <typename... M>
void modify(int l, int r, int pos, int ql, int qr, M&... v){
push(l, r, pos);
if (l >= ql && r <= qr){
seg[pos].apply(l, r, v...);
return;
}
int mid = (l + r) / 2;
if (ql <= mid) modify(l, mid, pos * 2, ql, qr, v...);
if (qr > mid) modify(mid + 1, r, pos * 2 + 1, ql, qr, v...);
pull(pos);
}
int get_first(int l, int r, int pos, int ql, int qr, int x0, int x1){
push(l, r, pos);
if (l > qr || r < ql) return -1;
if (!(seg[pos].v0 <= x0 || seg[pos].v1 <= x1)) return -1;
if (l == r) return l;
int mid = (l + r) / 2;
int res = get_first(l, mid, pos * 2, ql, qr, x0, x1);
if (res != -1) return res;
return get_first(mid + 1, r, pos * 2 + 1, ql, qr, x0, x1);
}
segtree (int _n){
n = _n;
seg.resize(4 * n + 1);
build(1, n, 1);
}
template <typename... M>
void modify(int ql, int qr, M&... v){
modify(1, n, 1, ql, qr, v...);
}
int get_first(int ql, int qr, int x0, int x1){
return get_first(1, n, 1, ql, qr, x0, x1);
}
};
struct FenwickTree{
int n;
vector <int> f;
vector <int> b;
inline void add(int i, int x){
if (i > n) return;
b[i] += x;
for (int j = i; j <= n; j += j & (-j)){
f[j] += x;
}
}
inline void modify(int i, int x){
add(i, x - b[i]);
}
inline void init(int nn, vector <int> a){
n = nn;
if ((int)a.size() == n){
vector <int> a2;
a2.push_back(0);
for (auto x : a) a2.push_back(x);
a = a2;
}
f.assign(n + 1, 0);
b.assign(n + 1, 0);
for (int i = 1; i <= n; i++){
modify(i, a[i]);
}
}
inline int query(int x){
int ans = 0;
for (int i = x; i; i -= i & (-i)){
ans += f[i];
}
return ans;
}
inline int query(int l, int r){
return query(r) - query(l - 1);
}
};
void Solve()
{
int n, k;
cin >> n >> k;
vector <int> a(n), b(n);
for (int i = 0; i < n; i++) cin >> a[i];
vector <int> lo(k, 0), hi(k, 0);
segtree seg(k);
for (int r = 0; r < k; r++){
int val = r + 1;
seg.modify(r + 1, r + 1, val, val);
}
FenwickTree fen;
vector <int> vv(n, 1);
fen.init(n, vv);
auto kth = [&](int k){
int l = 1, r = n + 1;
while (l < r){
int mid = (l + r) / 2;
if (fen.query(mid) >= k) r = mid;
else l = mid + 1;
}
return l;
};
int curr_max = 0;
int prev = -1;
for (int i = 0; i < n; i++){
int p0 = kth(n - i) - 1, p1 = kth(n - i + 1) - 1;
int q0 = (curr_max <= p0)? p0 : -1, q1 = (curr_max <= p1)? p1 : -1;
int zero = (k - a[i]) % k;
int r = seg.get_first(zero + 1, k, q0, q1);
if (r == -1) r = seg.get_first(1, zero, q0, q1);
r--;
if (i > 0){
if ((curr_max <= p0) || (max(curr_max, prev + 1 + hi[prev] * k) <= p1)){
if (r < 0 || (a[i] + prev) % k < (a[i] + r) % k){
r = prev;
}
}
}
b[i] = r;
fen.add(r + hi[r] * k + 1, -1);
hi[r]++;
if (r != prev){
curr_max = max(curr_max, r + lo[r] * k + 1);
lo[r]++;
prev = r;
}
int v1 = r + lo[r] * k + 1;
int v2 = r + hi[r] * k + 1;
seg.modify(r + 1, r + 1, v1, v2);
}
for (int i = 0; i < n; i++){
cout << ((a[i] + b[i]) % k) << " \n"[i + 1 == n];
}
}
int32_t main()
{
auto begin = std::chrono::high_resolution_clock::now();
ios_base::sync_with_stdio(0);
cin.tie(0);
int t = 1;
// freopen("in", "r", stdin);
// freopen("out", "w", stdout);
cin >> t;
for(int i = 1; i <= t; i++)
{
//cout << "Case #" << i << ": ";
Solve();
}
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
cerr << "Time measured: " << elapsed.count() * 1e-9 << " seconds.\n";
return 0;
}