
What Is the Fastest Way to Search a CSV File?

Task: check whether the given passport series and number are present in the file. My solution is as follows:

def check_passport(filename, series: str, number: str) -> dict: '''

Solution 1:

I examined three approaches:

  1. CSV line-by-line method of the original post
  2. Reading the raw text directly, without using the CSV reader to split fields
  3. Using Pandas to Read and Process Data in Chunks

Results

Tests were performed using 10 million to 30 million rows. [Plot: log(time) vs. number of rows for the three methods; see the test code below.]

Summary

Using Pandas was the slowest method. This is not surprising given the observation in this article that pandas is one of the slower ways to read CSV files, due to its overhead. The fastest was processing the file as raw text and searching for the numbers in that text (~2x faster than the originally posted CSV-reader method). Pandas was ~30% slower than the original method.

Test Code

import timeit
import time
import random
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import math
import itertools

def wrapper(func, *args, **kwargs):
    " Use to produced 0 argument function for call it"# Reference https://www.pythoncentral.io/time-a-python-function/
    def wrapped():
        return func(*args, **kwargs)
    return wrapped

def method_line_by_line(filename, series: str, number: str) -> dict:
    """
    Find passport number and series
    :param filename: csv filename path
    :param series: passport series
    :param number: passport number
    :return:
    """
    find = False
    with open(filename, 'r', encoding='utf_8_sig') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        try:
            for row in reader:
                if row[0] == series and row[1] == number:
                    find = True
                    break
        except Exception as e:
            pass

    if find:
        return {'result': True, 'message': 'Passport found'}
    else:
        return {'result': False, 'message': 'Passport not found in Database'}

def method_raw_text(filename, series: str, number: str) -> dict:
    """
    Find passport number and series by iterating through text records
    :param filename: csv filename path
    :param series: passport series
    :param number: passport number
    :return:
    """
    pattern = series + "," + number
    with open(filename, 'r', encoding='utf_8_sig') as csvfile:
        # iterates through the text looking for an exact match
        if any(map(lambda x: pattern == x.rstrip(), csvfile)):
            return {'result': True, 'message': 'Passport found'}
        else:
            return {'result': False, 'message': 'Passport not found in Database'}

def method_pandas_chunks(filename, series: str, number: str) -> dict:
    """
    Find passport number and series using Pandas in chunks
    :param filename: csv filename path
    :param series: passport series
    :param number: passport number
    :return:
    """
    chunksize = 10 ** 5
    for df in pd.read_csv(filename, chunksize=chunksize,
                          dtype={'PASSP_SERIES': str, 'PASSP_NUMBER': str}):
        df_search = df[(df['PASSP_SERIES'] == series) & (df['PASSP_NUMBER'] == number)]
        if not df_search.empty:
            break

    if not df_search.empty:
        return {'result': True, 'message': 'Passport found'}
    else:
        return {'result': False, 'message': 'Passport not found in Database'}

def generate_data(filename, number_records):
    " Generates random data for tests"
    df = pd.DataFrame(np.random.randint(0, 1e6,size=(number_records, 2)), columns=['PASSP_SERIES', 'PASSP_NUMBER'])
    df.to_csv(filename, index=False, header=True)
    return df

def profile():
    Nls = list(range(10000000, 30000001, 5000000))  # numbers of test rows
    number_iterations = 3  # repeats per test
    methods = [method_line_by_line, method_raw_text, method_pandas_chunks]
    time_methods = [[] for _ in range(len(methods))]

    for N in Nls:
        # Generate CSV File with N rows
        generate_data('test.csv', N)

        for i, func in enumerate(methods):
            # use 'x' & 'y' so that no match is found and the entire file is processed
            wrapped = wrapper(func, 'test.csv', 'x', 'y')
            time_methods[i].append(math.log(timeit.timeit(wrapped, number=number_iterations)))

    markers = itertools.cycle(('o', '+', '.'))
    colors = itertools.cycle(('r', 'b', 'g'))
    labels = itertools.cycle(('line-by-line', 'raw-text', 'pandas'))
    print(time_methods)
    for i in range(len(time_methods)):
        plt.plot(Nls, time_methods[i], marker=next(markers), color=next(colors),
                 linestyle='-', label=next(labels))

    plt.xlabel('number of rows', fontsize=18)
    plt.ylabel('log(time)', fontsize=18)
    plt.legend(loc = 'upper left')
    plt.show()

# Run Test
profile()

Solution 2:

CSV is a convenient and simple file format.

It is not intended for analysis or fast searching; that was never the goal. It is good for exchanging data between different applications, and for tasks where you have to process all entries anyway or where the number of entries is not very large.

If you want to speed things up, you should read the CSV file once, convert it to a database (e.g. sqlite), and then perform all the searches in the database. If passport numbers are unique, you could even just use a simple dbm file or a Python shelve, as sketched below.
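
A minimal sketch of the dbm variant, assuming each "series,number" pair occurs at most once (all file names here are illustrative, not from the original post):

import dbm

def build_index(csv_path="passports.csv", db_path="passports_dbm"):
    # One-time conversion: each "series,number" line becomes a key.
    with open(csv_path, encoding="utf_8_sig") as fin, dbm.open(db_path, "n") as db:
        next(fin)                          # skip the header line
        for line in fin:
            db[line.rstrip("\n")] = b"1"

def has_passport(db_path, series: str, number: str) -> bool:
    # a single key lookup instead of a full file scan
    with dbm.open(db_path, "r") as db:
        return (series + "," + number) in db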

Database performance can be optimized further by adding indexes on the fields that you search by.
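
A minimal sqlite sketch of the convert-once-then-query approach (table, column, and file names are assumptions):

import csv
import sqlite3

def build_db(csv_path="passports.csv", db_path="passports.sqlite"):
    # one-time conversion of the CSV into an indexed sqlite table
    con = sqlite3.connect(db_path)
    con.execute("CREATE TABLE IF NOT EXISTS passports (series TEXT, number TEXT)")
    with open(csv_path, encoding="utf_8_sig") as fin:
        reader = csv.reader(fin)
        next(reader)                       # skip the header line
        con.executemany("INSERT INTO passports VALUES (?, ?)", reader)
    # the index is what makes repeated searches fast
    con.execute("CREATE INDEX IF NOT EXISTS idx_series_number ON passports (series, number)")
    con.commit()
    return con

def check_passport_db(con, series: str, number: str) -> dict:
    cur = con.execute(
        "SELECT 1 FROM passports WHERE series = ? AND number = ? LIMIT 1",
        (series, number))
    if cur.fetchone():
        return {'result': True, 'message': 'Passport found'}
    return {'result': False, 'message': 'Passport not found in Database'}

Each subsequent lookup then hits the index instead of scanning the whole file.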

It all depends on how often the CSV file changes and how often you perform searches, but this approach should usually yield better results.

I have never really used pandas; perhaps it is more performant for searching/filtering, but it will never beat a lookup in a real, indexed database.

If you want to go down the sqlite or dbm road I can help with some code.

Addendum (bisect search in a sorted CSV file before reading with the csv reader):

If the first field in your CSV file is the series, then there is another approach (or one that works if you are willing to transform the CSV file so that it can be sorted with GNU sort).

Just sort your file (easy to do with GNU sort on a Linux system; it can sort huge files without exhausting memory), and the sorting time should not be much higher than the search time you are seeing at the moment.

Then use a bisect/seek search in the file for the first line with the right series, and use your existing function with a minor modification.

This will give you results within a few milliseconds. I tried it with a randomly created CSV file of 30 million entries and a size of about 1.5 GB.

If running on a Linux system, you could even change your code so that it creates a sorted copy of the downloaded CSV file whenever the CSV file changes (sorting on my machine took about 1 to 2 minutes), so at 2 to 3 searches per week this would already be worth the effort; a sketch follows below.
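
A minimal sketch of creating that sorted copy by delegating to GNU sort (assumes a Linux system with sort on the PATH; file names are placeholders):

import subprocess

def make_sorted_copy(src: str, dst: str) -> None:
    # copy the header line first, then let GNU sort order the body
    # by series, then number; GNU sort spills to disk, so it handles
    # files far larger than RAM
    with open(src, "rb", buffering=0) as fin, open(dst, "wb") as fout:
        fout.write(fin.readline())         # header stays on top
        fout.flush()                       # write our bytes before the child's
        subprocess.run(["sort", "-t,", "-k1,1", "-k2,2"],
                       stdin=fin, stdout=fout, check=True)

Note that sort orders lines as strings, which matches the string comparison used by the bisect search below.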

import csv
import datetime
import os

def get_line_at_pos(fin, pos):
    """ fetches first complete line at offset pos
        always skips header line
    """
    fin.seek(pos)
    skip = fin.readline()
    # next line for debugging only
    # print("Skip@%d: %r" % (pos, skip))
    npos = fin.tell()
    assert pos + len(skip) == npos
    line = fin.readline()
    return npos, line


def bisect_seek(fname, field_func, field_val):
    """ returns a file postion, which guarantees, that you will
        encounter all lines, that migth encounter field_val
        if the file is ordered by field_val.
        field_func is the function to extract field_val from a line
        The search is a bisect search, with a complexity of log(n)
    """
    size = os.path.getsize(fname)
    minpos, maxpos, cur = 0, size, int(size / 2)

    with open(fname) as fin:
        state = "?"
        prev_pos = -1
        while True:  # find first id smaller than the one we search
            # next line just for debugging
            pos_str = "%s %10d %10d %10d" % (state, minpos, cur, maxpos)
            realpos, line = get_line_at_pos(fin, cur)
            val = field_func(line)
            # next line just for debugging
            pos_str += "# got @%d: %r %r" % (realpos, val, line)
            if val >= field_val:
                state = ">"
                maxpos = cur
                cur = int((minpos + cur) / 2)
            else:
                state = "<"
                minpos = cur
                cur = int((cur + maxpos) / 2)
            # next line just for debugging
            # print(pos_str)
            if prev_pos == cur:
                break
            prev_pos = cur
    return realpos


def getser(line):
    return line.split(",")[0]


def check_passport(filename, series: str, number: str) -> dict:
    """
    Find passport number and series
    :param filename: csv filename path
    :param series: passport series
    :param number: passport number
    :return:
    """print(f'series={series}, number={number}')
    found = False
    start = datetime.datetime.now()
    # find the position from which we should start searching
    pos = bisect_seek(filename, getser, series)
    with open(filename, 'r', encoding='utf_8_sig') as csvfile:
        csvfile.seek(pos)
        reader = csv.reader(csvfile, delimiter=',')
        try:
            for row in reader:
                if row[0] == series and row[1] == number:
                    found = True
                    break
                elif row[0] > series:
                    # as the file is sorted we know we can abort now
                    break
        except Exception as e:
            print(e)
    print(datetime.datetime.now() - start)
    if found:
        print("good row", row)
        return {'result': True, 'message': 'Passport found'}
    else:
        print("bad row", row)
        return {'result': False, 'message': 'Passport not found in Database'}

Addendum 2019-11-30: Here is a script to split your huge file into smaller chunks and sort each chunk. (I didn't want to implement a full merge sort, as in this context searching each of the chunks separately is already efficient enough. If you are interested in more, I suggest trying to implement a merge sort, or posting a question about sorting huge files under Windows with Python; a lazy chunk-merge sketch follows the script below.)

split_n_sort_csv.py:

import itertools
import sys
import time

def main():
    args = sys.argv[1:]
    t = t0 = time.time()
    with open(args[0]) as fin:
        headline = next(fin)
        for idx in itertools.count():
            print(idx, "r")
            tprev = t
            lines = list(itertools.islice(fin, 10000000))
            t = time.time()
            t_read = t - tprev
            tprev = t
            print("s")
            lines.sort()
            t = time.time()
            t_sort = t - tprev
            tprev = t
            print("w")
            with open("bla_%03d.csv" % idx, "w") as fout:
                fout.write(headline)
                for line in lines:
                    fout.write(line)
            t = time.time()
            t_write = t - tprev
            tprev = t

            print("%4.1f %4.1f %4.1f" % (t_read, t_sort, t_write))
            if not lines:
                break
    t = time.time()
    print("Total of %5.1fs" % (t-t0))

if __name__ == "__main__":
    main()
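
If you do later want a single fully sorted file, the sorted chunks can be merged lazily with heapq.merge; a minimal sketch, assuming chunk files as produced by the script above (bla_000.csv, bla_001.csv, ...):

import glob
import heapq
from contextlib import ExitStack

def merge_chunks(pattern="bla_*.csv", out="sorted.csv"):
    # merge the individually sorted chunk files into one sorted file;
    # heapq.merge holds only one pending line per input file
    with ExitStack() as stack, open(out, "w") as fout:
        files = [stack.enter_context(open(f)) for f in sorted(glob.glob(pattern))]
        fout.write(next(files[0]))         # keep one copy of the header
        for f in files[1:]:
            next(f)                        # skip the header of the other chunks
        fout.writelines(heapq.merge(*files))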

And here is a modified version that searches all chunk files.

import csv
import datetime
import itertools
import os

ENCODING='utf_8_sig'

def get_line_at_pos(fin, pos, enc_encoding="utf_8"):
    """ fetches first complete line at offset pos
        always skips header line
    """while True:
        fin.seek(pos)
        try:
            skip = fin.readline()
            break
        except UnicodeDecodeError:
            pos += 1
    # print("Skip@%d: %r" % (pos, skip))
    npos = fin.tell()
    assert pos + len(skip.encode(enc_encoding)) == npos
    line = fin.readline()
    return npos, line

def bisect_seek(fname, field_func, field_val, encoding=ENCODING):
    size = os.path.getsize(fname)
    vmin, vmax, cur = 0, size, int(size / 2)
    if encoding.endswith("_sig"):
        enc_encoding = encoding[:-4]
    else:
        enc_encoding = encoding
    with open(fname, encoding=encoding) as fin:
        state = "?"
        prev_pos = -1
        while True:  # find first id smaller than the one we search
            # next line only for debugging
            pos_str = "%s %10d %10d %10d" % (state, vmin, cur, vmax)
            realpos, line = get_line_at_pos(fin, cur, enc_encoding=enc_encoding)
            val = field_func(line)
            # next line only for debugging
            pos_str += "# got @%d: %r %r" % (realpos, val, line)
            if val >= field_val:
                state = ">"
                vmax = cur
                cur = int((vmin + cur) / 2)
            else:
                state = "<"
                vmin = cur
                cur = int((cur + vmax) / 2)
            # next line only for debugging
            # print(pos_str)
            if prev_pos == cur:
                break
            prev_pos = cur
    return realpos

def getser(line):
    return line.split(",")[0]

def check_passport(filename, series: str, number: str,
        encoding=ENCODING) -> dict:
    """
    Find passport number and series
    :param filename: csv filename path
    :param series: passport series
    :param number: passport number
    :return:
    """print(f'series={series}, number={number}')
    found = False
    start = datetime.datetime.now()
    for ctr in itertools.count():
        fname = filename % ctr
        if not os.path.exists(fname):
            break
        print(fname)
        pos = bisect_seek(fname, getser, series)
        with open(fname, 'r', encoding=encoding) as csvfile:
            csvfile.seek(pos)
            reader = csv.reader(csvfile, delimiter=',')
            try:
                for row in reader:
                    if row[0] == series and row[1] == number:
                        found = True
                        break
                    elif row[0] > series:
                        break
            except Exception as e:
                print(e)
        if found:
            break
    print(datetime.datetime.now() - start)
    if found:
        print("good row in %s: %r" % (fname, row))
        return {'result': True, 'message': 'Passport found'}
    else:
        print("bad row", row)
        return {'result': False, 'message': 'Passport not found in Database'}

To test, call with:

check_passport("bla_%03d.csv", series, number)
