# code adapted from https://platform.openai.com/docs/tutorials/web-qa-embeddings
import os

import pandas as pd
import tiktoken

max_tokens = 500

# Load the cl100k_base tokenizer, which is designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")


# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens=max_tokens):

    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and token counts joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is
        # greater than the max number of tokens, then add the chunk to the list of chunks and
        # reset the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence alone is greater than the max number
        # of tokens, skip the sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Add the final chunk if it is not empty (the original tutorial code drops it)
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks


def remove_newlines(serie):
    serie = serie.str.replace('\n', ' ')
    serie = serie.str.replace('\\n', ' ')
    serie = serie.str.replace('  ', ' ')
    serie = serie.str.replace('  ', ' ')
    return serie


class EmbeddingsBuilder:

    def __init__(self):
        pass

    def txt2csv(self, sourcePath, csvfile):
        # Create a list to store the text files
        texts = []

        # Get all the text files in the source directory
        for file in os.listdir(sourcePath):

            # Open the file, read the text, and store the filename with its contents
            with open(os.path.join(sourcePath, file), "r", encoding="UTF-8") as f:
                text = f.read()
                texts.append((file, text))

        # Create a dataframe from the list of texts
        df = pd.DataFrame(texts, columns=['title', 'text'])

        # Set the text column to be the title plus the raw text with the newlines removed
        df['text'] = df.title + ". " + remove_newlines(df.text)

        # Tokenize the text and save the number of tokens to a new column
        df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

        shortened = []

        # Loop through the dataframe
        for row in df.iterrows():

            # If the text is None, go to the next row
            if row[1]['text'] is None:
                continue

            # If the number of tokens is greater than the max number of tokens,
            # split the text into chunks
            if row[1]['n_tokens'] > max_tokens:
                shortened += split_into_many(row[1]['text'])

            # Otherwise, add the text to the list of shortened texts
            else:
                shortened.append(row[1]['text'])

        df = pd.DataFrame(shortened, columns=['text'])
        df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
        df.to_csv(csvfile)
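

# Example usage (illustrative sketch, not part of the original tutorial code).
# The source directory and output CSV path below are hypothetical; point them at your own
# text dump and make sure the output directory already exists before running.
if __name__ == "__main__":
    builder = EmbeddingsBuilder()
    # Reads every file under text/, chunks anything longer than max_tokens tokens,
    # and writes the chunked rows plus their token counts to processed/scraped.csv
    builder.txt2csv("text/", "processed/scraped.csv")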