add Huffman compress instances (c2eb96ab) · Commits · Wenchao Zhang / Python-Instances

Huffman Project/Huffman-ReadMe.md

0 → 100644

+18 −0

Original line number	Diff line number	Diff line
		# Compression program using Huffman coding

		This is a Python 3 program for Huffman encoding and decoding.

		Usage:

		1. `python Huffman.py`
		2. Choose compression or decompression by entering `1` or `2`.
		3. Enter the path of the file to be processed.

		The compressed file has an additional extension `.hm`. The compression ratio of the given file `dickens.txt` is `56.5%`.



		Reference.

		1. [基于哈夫曼编码的压缩算法的Python实现](https://blog.str-mo.com/tech/153/)

Huffman Project/Huffman.py

0 → 100644

+160 −0

Original line number	Diff line number	Diff line
		# Define node class of Huffman tree
		class node(object):
		    """A node of the Huffman tree.

		    value -- weight of the node; ``con_sup`` sets a parent's weight to the
		             sum of its two children
		    left  -- left child, or None
		    right -- right child, or None
		    sup   -- parent node, or None for a node that has no parent yet
		    """

		    def __init__(self, value=None, left=None, right=None, sup=None):
		        self.value = value
		        self.left = left
		        self.right = right
		        self.sup = sup

		    @staticmethod
		    def con_sup(left, right):
		        """Create, link and return the parent of ``left`` and ``right``.

		        The parent's weight is ``left.value + right.value``; both children
		        get their ``sup`` attribute pointed at the new parent.
		        """
		        n = node(value=left.value + right.value, left=left, right=right)
		        left.sup = right.sup = n
		        return n

		    @staticmethod
		    def encode(n):
		        """Return the code of node ``n`` as ASCII digits, root first.

		        Each step from a parent to its left child contributes ``b'0'`` and
		        each step to its right child ``b'1'``. A node without a parent
		        yields ``b''``.
		        """
		        # Walk up to the root collecting digits, then flip them so the
		        # digit nearest the root comes first.
		        digits = []
		        while n.sup is not None:
		            digits.append(b'0' if n.sup.left is n else b'1')
		            n = n.sup
		        digits.reverse()
		        return b''.join(digits)

		#Construct the Huffman tree
		def con_tree(tree):
		    """Link a list of nodes into one Huffman tree.

		    While more than one node is left, the nodes are sorted by ``value``,
		    the two lightest are replaced by their parent (``node.con_sup``) and
		    the parent is appended at the end. The sort is stable, so nodes of
		    equal weight keep their current order; the decoder rebuilds the tree
		    with this same function, so that order must not change.

		    tree -- list of nodes. The list itself is not modified, but the nodes
		            in it are linked together.
		    Returns a one-element list holding the root. When ``tree`` has fewer
		    than two nodes it is returned unchanged (an empty list used to raise
		    IndexError).
		    """
		    pending = tree
		    while len(pending) > 1:
		        pending = sorted(pending, key=lambda x: x.value)
		        parent = node.con_sup(pending[0], pending[1])
		        del pending[:2]
		        pending.append(parent)
		    return pending

		# Output encoding table
		def encode(echo):
		    """Fill the module-level ``ec_dict`` with the code of every symbol.

		    For each key of ``nd_dict`` the code of its node is computed with
		    ``node.encode`` and stored under the same key in ``ec_dict``.

		    echo -- when truthy, print every symbol together with its code.
		    """
		    for x, leaf in nd_dict.items():
		        # node.encode gives b'' for a node without a parent, which is what
		        # the only symbol of a one-symbol input gets. An empty code emits
		        # no bits and cannot be decoded, so fall back to a one-bit code.
		        ec_dict[x] = node.encode(leaf) or b'0'
		        if echo:
		            print(x, ec_dict[x])

		# Define compressing method for files
		def compfile(inputfile):
		    """Compress ``inputfile`` with Huffman coding into ``inputfile + ".hm"``.

		    Output layout: the original file name followed by a newline, the
		    number of distinct byte values (2 bytes, big-endian), one entry of
		    ``symbol byte + 3-byte big-endian count`` per distinct value, then the
		    code bits packed most-significant-bit first.

		    Reads and fills the module-level ``ct_dict``, ``nd_dict`` and
		    ``ec_dict``.

		    inputfile -- path of the file to compress.
		    Raises OverflowError (from ``int.to_bytes``) when one byte value occurs
		    more than 16,777,215 times, because counts are stored in 3 bytes.
		    """
		    print("Starting compress...")
		    with open(inputfile, "rb") as f:
		        content = f.read()
		    count = len(content)
		    # One single-byte ``bytes`` object per input position; these are the
		    # keys of every table and are written verbatim into the header.
		    cache = [content[k:k + 1] for k in range(count)]

		    # Calculate byte frequencies and build one leaf node per distinct byte
		    for symbol in cache:
		        ct_dict[symbol] = ct_dict.get(symbol, 0) + 1
		    print("Read finished")
		    print(ct_dict)  # Output weight dictionary
		    nodes = []
		    for x in ct_dict:
		        nd_dict[x] = node(ct_dict[x])
		        nodes.append(nd_dict[x])
		    if nodes:  # an empty input has no tree to build
		        con_tree(nodes)  # Construct the Huffman tree
		    encode(False)  # Construct the encoding table; pass True to print it
		    print("Encoding table is OK")

		    # Write the compressed binary file
		    raw = 0b1  # bit accumulator; the leading 1 is a sentinel marking the fill level
		    end = 0
		    name = inputfile.split('/')
		    # The archive sits next to the input. The old ``name[0] + ".hm"`` used
		    # only the first path component, e.g. "dir/file" was written to "dir.hm".
		    with open(inputfile + ".hm", 'wb') as o:
		        o.write((name[-1] + '\n').encode(encoding="utf-8"))  # original file name
		        o.write(int.to_bytes(len(ec_dict), 2, byteorder='big'))  # number of symbols
		        for x in ec_dict:  # header: symbol followed by its count
		            o.write(x)
		            o.write(int.to_bytes(ct_dict[x], 3, byteorder='big'))
		        for i, symbol in enumerate(cache):  # Start compressing data
		            for bit in ec_dict[symbol]:
		                raw = raw << 1
		                if bit == 49:  # ord('1')
		                    raw = raw | 1
		                if raw.bit_length() == 9:  # sentinel + 8 data bits: byte complete
		                    o.write(int.to_bytes(raw & 0xFF, 1, byteorder='big'))
		                    raw = 0b1
		            pro = int(i / count * 100)
		            if pro > end:
		                print("compressing:", pro, '%')  # Output compressing progress
		                end = pro
		        # Flush the bits of an unfinished last byte, padded with zeros on
		        # the right. They used to be dropped, losing the final symbols.
		        if raw.bit_length() > 1:
		            used = raw.bit_length() - 1
		            o.write(int.to_bytes((raw << (8 - used)) & 0xFF, 1, byteorder='big'))
		    print("File compress successful.")

		# Define decompressing method for files
		def decompfile(inputfile):
		    """Decompress a ``.hm`` file written by ``compfile``.

		    The header (stored file name, symbol count, then one
		    ``symbol byte + 3-byte count`` entry per symbol) is parsed, the Huffman
		    tree is rebuilt from the counts with ``con_tree``/``encode``, and the
		    remaining bytes are decoded bit by bit, most significant bit first.
		    The result is written next to ``inputfile`` under the stored name.

		    Uses the module-level ``nd_dict``, ``nodes``, ``ec_dict`` and
		    ``iv_dict``.

		    inputfile -- path of the compressed file.
		    """
		    print("Starting decompress...")
		    end = 0
		    with open(inputfile, 'rb') as f:
		        stored_name = f.readline().decode(encoding="utf-8").replace('\n', '')
		        count = int.from_bytes(f.read(2), byteorder='big')  # Take out the number of nodes
		        de_dict = {}
		        for _ in range(count):  # Parsing file headers
		            key = f.read(1)
		            de_dict[key] = int.from_bytes(f.read(3), byteorder='big')
		        i = f.tell()  # file position of the first data byte
		        payload = f.read()
		    eof = i + len(payload)

		    # Swap only the last path component for the stored name. The previous
		    # str.replace on the whole path rewrote every occurrence of the
		    # basename. Any directory part inside the stored name is dropped so a
		    # crafted archive cannot write outside the archive's directory.
		    name = inputfile.split('/')
		    name[-1] = stored_name.split('/')[-1]
		    outputfile = '/'.join(name)

		    for x in de_dict:
		        nd_dict[x] = node(de_dict[x])
		        nodes.append(nd_dict[x])
		    if nodes:  # an archive of an empty file has no tree
		        con_tree(nodes)  # reconstruct Huffman tree
		    encode(False)  # Create encoding table; pass True to print it
		    for x in ec_dict:  # Construct the reverse dictionary
		        iv_dict[ec_dict[x]] = x

		    # The counts in the header add up to the number of symbols in the
		    # original file. Decoding stops there so that zero bits padding the
		    # last byte are never turned into extra symbols.
		    total = sum(de_dict.values())
		    written = 0
		    data = b''
		    with open(outputfile, 'wb') as o:
		        for byte in payload:  # Start decompressing data
		            i = i + 1
		            for shift in range(7, -1, -1):
		                if written == total:
		                    break
		                data += b'1' if (byte >> shift) & 1 else b'0'
		                if data in iv_dict:
		                    o.write(iv_dict[data])
		                    written += 1
		                    data = b''
		            pro = int(i / eof * 100)
		            if pro > end:
		                print("decompressing:", pro, '%')  # Output decompression progress
		                end = pro
		    print("File decompress successful.")

		#init data
		# Module-level tables shared by the functions above:
		#   nd_dict: symbol -> node, ct_dict: symbol -> count,
		#   ec_dict: symbol -> code, iv_dict: code -> symbol
		nd_dict, ct_dict, ec_dict, iv_dict = {}, {}, {}, {}
		nodes = []

		# Ask for the mode first, then for the path of the file to process.
		choice = input("Please choose what you want to do:\n1：Compression\t2：Decompression\n")
		if choice != '1':
		    decompfile(input("Please input a file with path for decompression:"))
		else:
		    compfile(input("Please input a file with path for compression:"))
		No newline at end of file

Huffman Project/Huffman.with.verify.head.py

0 → 100644

+193 −0

Original line number	Diff line number	Diff line
		#Define the node class of Huffman tree
		class node(object):
		    """A node of the Huffman tree.

		    value  -- weight of the node; ``build_father`` sets a parent's weight
		              to the sum of its two children
		    left   -- left child, or None
		    right  -- right child, or None
		    father -- parent node, or None for a node that has no parent yet
		    """

		    def __init__(self, value=None, left=None, right=None, father=None):
		        self.value = value
		        self.left = left
		        self.right = right
		        self.father = father

		    @staticmethod
		    def build_father(left, right):
		        """Create, link and return the parent of ``left`` and ``right``.

		        The parent's weight is ``left.value + right.value``; both children
		        get their ``father`` attribute pointed at the new parent.
		        """
		        n = node(value=left.value + right.value, left=left, right=right)
		        left.father = right.father = n
		        return n

		    @staticmethod
		    def encode(n):
		        """Return the code of node ``n`` as ASCII digits, root first.

		        Each step from a parent to its left child contributes ``b'0'`` and
		        each step to its right child ``b'1'``. A node without a parent
		        yields ``b''``.
		        """
		        # Walk up to the root collecting digits, then flip them so the
		        # digit nearest the root comes first.
		        digits = []
		        while n.father is not None:
		            digits.append(b'0' if n.father.left is n else b'1')
		            n = n.father
		        digits.reverse()
		        return b''.join(digits)

		#Construct the Huffman tree
		def build_tree(tree):
		    """Link a list of nodes into one Huffman tree.

		    While more than one node is left, the nodes are sorted by ascending
		    ``value``, the two lightest are replaced by their parent
		    (``node.build_father``) and the parent is appended at the end. The
		    sort is stable, so nodes of equal weight keep their current order; the
		    decoder rebuilds the tree with this same function, so that order must
		    not change.

		    tree -- list of nodes. The list itself is not modified, but the nodes
		            in it are linked together.
		    Returns a one-element list holding the root. When ``tree`` has fewer
		    than two nodes it is returned unchanged (an empty list used to raise
		    IndexError).
		    """
		    pending = tree
		    while len(pending) > 1:
		        pending = sorted(pending, key=lambda x: x.value, reverse=False)
		        parent = node.build_father(pending[0], pending[1])
		        del pending[:2]
		        pending.append(parent)
		    return pending

		# Output encoding table
		def encode(echo):
		    """Fill the module-level ``ec_dict`` with the code of every symbol.

		    For each key of ``node_dict`` the code of its node is computed with
		    ``node.encode`` and stored under the same key in ``ec_dict``.

		    echo -- when truthy, print every symbol and then its code.
		    """
		    for x, leaf in node_dict.items():
		        # node.encode gives b'' for a node without a parent, which is what
		        # the only symbol of a one-symbol input gets. An empty code emits
		        # no bits and cannot be decoded, so fall back to a one-bit code.
		        ec_dict[x] = node.encode(leaf) or b'0'
		        if echo:
		            print(x)
		            print(ec_dict[x])

		def encodefile(inputfile):
		    """Compress ``inputfile`` with Huffman coding into a ``.hm`` file.

		    The output path is ``inputfile`` with its last extension replaced by
		    ``.hm``. Output layout: the original file name followed by a newline,
		    the number of distinct symbols (2 bytes, big-endian), the byte width
		    used for the counts (1 byte), one ``symbol + count`` entry per
		    distinct symbol, then the code bits packed most-significant-bit first.
		    The last byte is padded with zero bits.

		    Reads and fills the module-level ``count_dict``, ``node_dict`` and
		    ``ec_dict``.

		    inputfile -- path of the file to compress.
		    """
		    import os  # local import: only needed to derive the output path

		    print("Starting encode...")
		    bytes_width = 1  # size of one symbol in bytes
		    with open(inputfile, "rb") as f:
		        content = f.read()
		    count = len(content) / bytes_width  # true division kept so the printed value is unchanged
		    print(count)
		    # One ``bytes`` object per symbol; these are the keys of every table
		    # and are written verbatim into the header.
		    buff = [content[k:k + bytes_width] for k in range(0, len(content), bytes_width)]

		    # Calculate symbol frequencies and build one leaf node per distinct symbol
		    for symbol in buff:
		        count_dict[symbol] = count_dict.get(symbol, 0) + 1
		    print("Read OK")
		    print(count_dict)  # Output weight dictionary
		    nodes = []  # List of nodes, used to build the Huffman tree
		    for x in count_dict:
		        node_dict[x] = node(count_dict[x])
		        nodes.append(node_dict[x])
		    if nodes:  # an empty input has no tree to build
		        build_tree(nodes)  # Construct the Huffman tree
		    encode(False)  # Construct the encoding table
		    print("Encode OK")

		    # Use the smallest whole number of bytes that holds the largest count,
		    # to keep the header small. For counts below 2**32 this gives the same
		    # 1..4 as the former threshold chain; an empty input, which used to
		    # raise IndexError, gets width 1.
		    largest = max(count_dict.values(), default=0)
		    print("head:", largest)
		    bit_width = max(1, (largest.bit_length() + 7) // 8)
		    print("bit_width:", bit_width)

		    raw = 0b1  # bit accumulator; the leading 1 is a sentinel marking the fill level
		    last = 0
		    # Replace only the final extension. Splitting the whole path on '.'
		    # broke on any earlier dot, e.g. "./dir/file.txt" was written to ".hm".
		    target = os.path.splitext(inputfile)[0] + ".hm"
		    name = inputfile.split('/')
		    with open(target, 'wb') as o:
		        o.write((name[-1] + '\n').encode(encoding="utf-8"))  # original file name
		        o.write(int.to_bytes(len(ec_dict), 2, byteorder='big'))  # number of symbols
		        o.write(int.to_bytes(bit_width, 1, byteorder='big'))  # byte width of the counts
		        for x in ec_dict:  # header: symbol followed by its count
		            o.write(x)
		            o.write(int.to_bytes(count_dict[x], bit_width, byteorder='big'))

		        print('head OK')
		        for i, symbol in enumerate(buff):  # Start compressing data
		            for bit in ec_dict[symbol]:
		                raw = raw << 1
		                if bit == 49:  # ord('1')
		                    raw = raw | 1
		                if raw.bit_length() == 9:  # sentinel + 8 data bits: byte complete
		                    o.write(int.to_bytes(raw & 0xFF, 1, byteorder='big'))
		                    raw = 0b1
		            tem = int(i / len(buff) * 100)
		            if tem > last:
		                print("encode:", tem, '%')  # Output compressing progress
		                last = tem

		        # Flush the bits of an unfinished last byte, padded with zeros on the right
		        if raw.bit_length() > 1:
		            used = raw.bit_length() - 1
		            o.write(int.to_bytes((raw << (8 - used)) & 0xFF, 1, byteorder='big'))
		    print("File encode successful.")

		def decodefile(inputfile):
		    """Decompress a ``.hm`` file written by ``encodefile``.

		    The header (stored file name, symbol count, count byte width, then one
		    ``symbol byte + count`` entry per symbol) is parsed, the Huffman tree
		    is rebuilt from the counts with ``build_tree``/``encode``, and the
		    remaining bytes are decoded bit by bit, most significant bit first.
		    The result is written next to ``inputfile`` under the stored name.

		    Uses the module-level ``node_dict``, ``nodes``, ``ec_dict`` and
		    ``inverse_dict``.

		    inputfile -- path of the compressed file.
		    """
		    print("Starting decode...")
		    last = 0
		    with open(inputfile, 'rb') as f:
		        stored_name = f.readline().decode(encoding="utf-8").replace('\n', '')
		        count = int.from_bytes(f.read(2), byteorder='big')  # Take out the number of nodes
		        bit_width = int.from_bytes(f.read(1), byteorder='big')  # Take out the byte width of the counts
		        de_dict = {}
		        for _ in range(count):  # Parsing file headers
		            key = f.read(1)
		            de_dict[key] = int.from_bytes(f.read(bit_width), byteorder='big')
		        i = f.tell()  # file position of the first data byte
		        payload = f.read()
		    eof = i + len(payload)

		    # Swap only the last path component for the stored name. The previous
		    # str.replace on the whole path rewrote every occurrence of the
		    # basename. Any directory part inside the stored name is dropped so a
		    # crafted archive cannot write outside the archive's directory.
		    name = inputfile.split('/')
		    name[-1] = stored_name.split('/')[-1]
		    outputfile = '/'.join(name)

		    for x in de_dict:
		        node_dict[x] = node(de_dict[x])
		        nodes.append(node_dict[x])
		    if nodes:  # an archive of an empty file has no tree
		        build_tree(nodes)  # reconstruct Huffman tree
		    encode(False)  # Create encoding table
		    for x in ec_dict:  # Construct the reverse dictionary
		        inverse_dict[ec_dict[x]] = x

		    # The counts in the header add up to the number of symbols in the
		    # original file. Decoding stops there so that the zero bits padding
		    # the last byte are never turned into extra symbols.
		    total = sum(de_dict.values())
		    written = 0
		    data = b''
		    with open(outputfile, 'wb') as o:
		        for byte in payload:  # Start decompressing data
		            i = i + 1
		            for shift in range(7, -1, -1):
		                if written == total:
		                    break
		                data += b'1' if (byte >> shift) & 1 else b'0'
		                if data in inverse_dict:
		                    o.write(inverse_dict[data])
		                    written += 1
		                    data = b''
		            tem = int(i / eof * 100)
		            if tem > last:
		                print("decode:", tem, '%')  # Output decompression progress
		                last = tem
		    print("File decode successful.")

		# Shared tables used by encode(), encodefile() and decodefile().
		# They are created at import time rather than under the __main__ guard,
		# so the functions also work when this module is imported.
		node_dict = {}     # symbol -> node
		count_dict = {}    # symbol -> occurrence count
		ec_dict = {}       # symbol -> code
		nodes = []         # nodes handed to build_tree when decoding
		inverse_dict = {}  # code -> symbol

		if __name__ == '__main__':
		    # Ask for the mode first, then for the path of the file to process.
		    if input("This is a python program for the Huffman encoding and decoding.\nPlease choose what you want to do:\n1：Compression\t2：Decompression\n") == '1':
		        encodefile(input("Please input a file with path to compress:"))
		    else:
		        decodefile(input("Please input a file with path to decompress:"))
		No newline at end of file

Huffman Project/LZWen.py

0 → 100644

+29 −0

Original line number	Diff line number	Diff line
		# refer https://www.runoob.com/w3cnote/python-lzw.html

		def _lzw_encode(data):
		    """Return the list of LZW codes for the byte string ``data``.

		    The dictionary starts with one entry per byte value (codes 0..255);
		    every new sequence gets the next free code, starting at 256.
		    """
		    codes = {bytes([value]): value for value in range(256)}
		    next_code = 256
		    prefix = b''
		    out = []
		    for value in data:
		        symbol = bytes([value])
		        candidate = prefix + symbol
		        if candidate in codes:
		            prefix = candidate  # keep extending the current match
		        else:
		            out.append(codes[prefix])
		            codes[candidate] = next_code
		            next_code += 1
		            prefix = symbol
		    if prefix:  # emit the code of the last pending match
		        out.append(codes[prefix])
		    return out


		# Compress the file's actual bytes. The previous str(f.readlines())
		# compressed the repr of a list of lines (brackets, quotes, escaped
		# newlines), and the 127-entry dictionary raised KeyError on any
		# character outside chr(0)..chr(126).
		with open(r'd.md', 'rb') as f:
		    string = f.read()

		result = _lzw_encode(string)

		print(result)

		with open(r'd.md.lzw', 'w') as o:
		    o.write(str(result))
		No newline at end of file

Admin message