#!/usr/bin/env python3

import argparse
import asyncio
from datetime import datetime

import aiofiles
import aioredis
from aiocsv import AsyncReader

'''
To install: pip install -r requirements.txt
'''
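
# requirements.txt is not included in this file; judging by the imports above
# it presumably lists at least:
#   aioredis
#   aiofiles
#   aiocsv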


class Command:
    args = None
    sync_id = 0  # Commands with the same sync_id will be executed synchronously


class TwitterCacheTraceParser:
    """
    Parses one row of a Twitter cache trace into a Redis command.
    Trace format: https://github.com/twitter/cache-trace
    """

    def parse(self, csv) -> Command:
        operation = csv[5]
        key = csv[1] + "a"
        value_size = int(csv[3])
        synthetic_value = "".zfill(value_size)  # string of value_size zeros as a stand-in payload

        client_id = csv[4]
        ttl = csv[6]  # parsed but not applied to the generated commands

        cmd = Command()
        cmd.sync_id = client_id

        if operation == "get":
            cmd.args = ["GET", key]
        elif operation == 'gets':
            cmd.args = ["GET", key]
        elif operation == 'set':
            cmd.args = ["SET", key, synthetic_value]
        elif operation == 'add':
            # add/replace/cas semantics are approximated with a plain SET
            cmd.args = ["SET", key, synthetic_value]
        elif operation == 'replace':
            cmd.args = ["SET", key, synthetic_value]
        elif operation == 'cas':
            cmd.args = ["SET", key, synthetic_value]
        elif operation == 'append':
            cmd.args = ["APPEND", key, synthetic_value]
        elif operation == 'prepend':
            # Redis has no PREPEND command; approximated with SET
            cmd.args = ["SET", key, synthetic_value]
        elif operation == 'delete':
            cmd.args = ["DEL", key]
        elif operation == 'incr':
            cmd.args = ["INCR", key]
        elif operation == 'decr':
            cmd.args = ["DECR", key]

        return cmd


class AsyncWorker:
    QUEUE_SIZE = 100000

    def __init__(self, redis_client) -> None:
        self.queue = asyncio.Queue(self.QUEUE_SIZE)
        self.redis_client = redis_client
        self.working = False

    async def put(self, batch: list) -> None:
        await self.queue.put(batch)

    async def work(self) -> None:
        self.working = True
        while self.working or not self.queue.empty():
            try:
                # Wake up periodically so the loop can observe stop() even when
                # the queue stays empty; a bare queue.get() would block forever.
                batch = await asyncio.wait_for(self.queue.get(), timeout=1)
            except asyncio.TimeoutError:
                continue
            await self.execute(batch)

    async def execute(self, batch) -> None:
        async with self.redis_client.pipeline(transaction=False) as pipe:
            for cmd in batch:
                pipe.execute_command(*cmd.args)
            await pipe.execute()

    def start(self) -> asyncio.Task:
        return asyncio.create_task(self.work())

    def stop(self) -> None:
        self.working = False


class AsyncWorkerPool:
    """
    Manages a worker pool to send commands in parallel.

    Maintains synchronous order for commands with the same sync_id.
    """

    def __init__(self, redis_client, num_workers) -> None:
        self.redis_client = redis_client
        self.num_workers = num_workers
        self.workers = []
        self.tasks = []
        self.sync_id_to_worker = {}
        self.next_worker_index = -1

    def allocate(self, sync_id) -> AsyncWorker:
        if sync_id not in self.sync_id_to_worker:
            self.next_worker_index = (self.next_worker_index + 1) % self.num_workers

            # Lazily create workers until num_workers of them are running
            if len(self.workers) <= self.next_worker_index:
                assert len(self.workers) == self.next_worker_index
                self.workers.append(AsyncWorker(self.redis_client))
                self.tasks.append(self.workers[self.next_worker_index].start())

            self.sync_id_to_worker[sync_id] = self.workers[self.next_worker_index]

        return self.sync_id_to_worker[sync_id]

    async def put(self, batch: list, sync_id: int) -> None:
        worker = self.allocate(sync_id)
        await worker.put(batch)

    async def stop(self):
        for worker in self.workers:
            worker.stop()
        await asyncio.gather(*self.tasks)


class AsyncPlayer:
    READ_BATCH_SIZE = 10 * 1000 * 1000  # trace lines to buffer before dispatching to the workers

    def __init__(self, redis_uri, num_workers) -> None:
        self.redis_uri = redis_uri
        self.redis_client = aioredis.from_url(f"redis://{self.redis_uri}", encoding="utf-8", decode_responses=True)
        self.worker_pool = AsyncWorkerPool(self.redis_client, num_workers)

        self.batch_by_sync_id = {}

    async def dispatch_batches(self):
        for sync_id in self.batch_by_sync_id:
            await self.worker_pool.put(self.batch_by_sync_id[sync_id], sync_id)
        self.batch_by_sync_id.clear()

    async def read_and_dispatch(self, csv_file, parser):
        print(f"dispatching from {csv_file}")

        line_count = 0

        async with aiofiles.open(csv_file, mode="r", encoding="utf-8", newline="") as afp:
            async for row in AsyncReader(afp):
                cmd = parser.parse(row)
                if not self.batch_by_sync_id.get(cmd.sync_id):
                    self.batch_by_sync_id[cmd.sync_id] = []
                batch = self.batch_by_sync_id[cmd.sync_id]
                batch.append(cmd)
                line_count += 1
                if line_count >= self.READ_BATCH_SIZE:
                    await self.dispatch_batches()
                    line_count = 0

        # handle the remaining lines
        await self.dispatch_batches()

    async def print_stats(self):
        info = await self.redis_client.execute_command("info", "stats")
        print(f"{datetime.now()}: {info}")

    async def report_stats(self):
        while True:
            await self.print_stats()
            await asyncio.sleep(10)

    async def play(self, csv_file, parser) -> None:
        print(f"pinging {self.redis_uri} successful?")
        print(await self.redis_client.ping())

        read_dispatch_task = asyncio.create_task(self.read_and_dispatch(csv_file, parser))
        stats_task = asyncio.create_task(self.report_stats())

        await read_dispatch_task
        print(f"finished reading {csv_file}")

        await self.worker_pool.stop()
        stats_task.cancel()
        print("all done")
        await self.print_stats()


def main():
    parser = argparse.ArgumentParser(description='Cache Logs Player')
    parser.add_argument('-u', '--uri', type=str, default='localhost:6379', help='Redis server URI')
    parser.add_argument('-f', '--csv_file', type=str, default='/home/ari/Downloads/cluster017.csv', help='Path to the cache trace CSV file')
    parser.add_argument('--num_workers', type=int, default=100, help='Maximum number of workers sending commands in parallel')

    args = parser.parse_args()

    player = AsyncPlayer(redis_uri=args.uri, num_workers=args.num_workers)
    asyncio.run(player.play(args.csv_file, TwitterCacheTraceParser()))


if __name__ == "__main__":
    main()
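
# Example invocation (the script file name below is an assumption, not given in
# this file):
#   python3 cache_trace_player.py --uri localhost:6379 \
#       --csv_file /home/ari/Downloads/cluster017.csv --num_workers 100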