Sunday 28 October 2018

CODING FP-GROWTH IN PYTHON FROM SCRATCH


# Toy transaction database used by the FP-growth demo below: one inner list
# per transaction.
dats = [
    ['google', 'amazon'],
    ['amazon', 'google', 'python', 'cse'],
    ['cse', 'google'],
    ['amazon', 'python'],
    ['cse', 'amazon', 'python', 'google'],
    ['amazon', 'google', 'cse', 'data'],
]

class fptree:
    """A single node of the FP-tree.

    Attributes set by the constructor:
        ide:    identifier stored on the node (the item it stands for).
        cnt:    running count for this node.
        parent: the node passed in as parent (``None`` is accepted).
        link:   starts as ``None``; other code in this file chains nodes
                through it.
        child:  starts as an empty dict; each node gets its own dict.
    """

    def __init__(self, ide, cnt, parent):
        # Fresh containers first, then the caller-supplied fields.
        self.child = {}
        self.link = None
        self.ide, self.cnt, self.parent = ide, cnt, parent

    def increm(self, cnt):
        """Add ``cnt`` to this node's count (in place, returns nothing)."""
        self.cnt += cnt

def genFi(data,minsp,dic=False):
    """Count, per item, how many transactions of ``data`` contain it.

    NOTE(review): this listing is incomplete. The text between ``if v`` and
    ``1:`` further down appears to have been lost, taking with it the rest
    of this function and the header of a ``genTree`` function. As shown, the
    block is not valid Python and has no ``return``.

    Callers in this file unpack the result as a pair
    (``new_null,new_kdc=genFi(da,2)``) and compare the first element with
    ``None``; presumably that is (tree root, header table) -- TODO confirm
    against the original source.
    """

    # item -> number of transactions in which the item occurs
    kdc={}

    fi,sfi=[],[]

    nnewdata={}

    for dat in data:

        for i in range(0,len(dat)):

            if dat[i] not in kdc:

                kdc[dat[i]]=1

            # repeated item inside the same transaction: count it only once
            elif dat[i] in dat[0:i]:

                continue

            else:

                kdc[dat[i]]+=1

    for k,v in kdc.items():

        # NOTE(review): ``v1`` is never defined; this line looks like the
        # fused remains of two different statements -- TODO restore.
        if v1:

        # NOTE(review): ``genTree``, ``cnt`` and ``null`` are not defined
        # anywhere in the visible listing; this reads like the recursive
        # tail of the missing ``genTree`` -- TODO restore.
        genTree(data[1:],cnt,null.child[data[0]],kdc)

#nodes with same names but in different paths

def updateNull(alr_pre_node, same_new):
    """Append ``same_new`` at the end of the ``link`` chain of ``alr_pre_node``.

    Follows ``.link`` from ``alr_pre_node`` until a node whose ``link`` is
    ``None`` is reached, then points that node's ``link`` at ``same_new``.
    """
    tail = alr_pre_node
    successor = tail.link
    while successor != None:
        tail = successor
        successor = tail.link
    tail.link = same_new

def gen_cond_pattern_bases(node):
    """Collect the ancestor paths of every node on ``node``'s link chain.

    For ``node`` and each node reachable through ``.link``, ``bottom_up`` is
    used to list the identifiers from that node upwards. The node's own
    identifier (first entry) is dropped; if anything remains, the remainder
    (as a tuple) is mapped to that node's ``cnt``.

    Returns a dict ``{path_tuple: cnt}``; empty when ``node`` is ``None``.
    """
    bases = {}
    current = node
    while current != None:
        path = []
        bottom_up(current, path)
        ancestors = path[1:]
        # Same condition as "path has more than one entry".
        if ancestors:
            bases[tuple(ancestors)] = current.cnt
        current = current.link
    return bases

def bottom_up(node, prefix):
    """Append to ``prefix`` the ``ide`` of ``node`` and of each ancestor.

    Walks ``.parent`` upwards and stops at the first node whose ``parent`` is
    ``None``; that node's identifier is not appended. ``prefix`` is modified
    in place and nothing is returned.
    """
    current = node
    while current.parent != None:
        prefix.append(current.ide)
        current = current.parent

def cond_tree(null,kdc,minsup,prefix,freq_items,sup=None,sing_sup=0):
    """Recursively mine frequent itemsets and append them to ``freq_items``.

    Args:
        null: root of the current (conditional) tree; not read here, kept
            for interface compatibility.
        kdc: header table ``{item: entry}`` where ``entry[0]`` is the item's
            support and ``entry[1]`` is the first tree node for the item.
        minsup: minimum support used when building conditional trees.
        prefix: set of items already fixed on this recursion path.
        freq_items: output list; ``(itemset, support)`` tuples are appended.
        sup: conditional pattern bases ``{path_tuple: count}`` that produced
            ``kdc``; ``None`` (the default) means "none", as at the top level.
        sing_sup: table used to look up the support of a single item when
            ``sup`` gives no count for it; a falsy value falls back to ``kdc``.

    Returns:
        None. Results are delivered through ``freq_items``.
    """
    # A shared ``{}`` default would be one object reused across calls.
    if sup is None:
        sup={}
    # The old default ``0`` is not subscriptable; the header table of the
    # current level carries the same ``entry[0]`` support information.
    if not sing_sup:
        sing_sup=kdc

    # Order items by support only. Sorting on the whole entry would compare
    # tree nodes whenever two supports tie, which raises TypeError.
    ordered=[item for item,_ in sorted(kdc.items(),key=lambda p:p[1][0])]

    for it in ordered:
        new_freq_items=prefix.copy()
        new_freq_items.add(it)

        # Support of ``prefix + it``: total count of pattern-base paths that
        # contain ``it``.
        cond_support=sum(v for k,v in sup.items() if it in k)
        if cond_support!=0:
            support=cond_support
        else:
            support=sing_sup[it][0]
        freq_items.append((new_freq_items,support))

        patterns=gen_cond_pattern_bases(kdc[it][1])

        # Expand each path ``count`` times to form the conditional database.
        da=[list(path) for path,count in patterns.items() for _ in range(count)]

        # Use the caller's threshold (this used to be hard-coded to 2).
        new_null,new_kdc=genFi(da,minsup)

        if new_null is not None:
            cond_tree(new_null,new_kdc,minsup,new_freq_items,freq_items,patterns,sing_sup)

# Driver for the FP-growth demo: build the tree, mine it, tabulate results.
# pandas is needed below and is not imported anywhere above this point.
import pandas as pd

# genFi's result is indexed as [0] and [1] here; cond_tree receives [1] both
# as the header table and as the single-item support table.
htable=genFi(dats,2)

freq_itemss=[]

cond_tree(htable[0],htable[1],2,set(),freq_itemss,{},htable[1])

# Bare expression: echoes the mined (itemset, support) pairs in a notebook.
freq_itemss

# Split the pairs into the two table columns.
its=[list(items) for items,_ in freq_itemss]

sps=[support for _,support in freq_itemss]

data=pd.DataFrame(columns=["items","support"])

data["items"]=its

data["support"]=sps

# Bare expression: displays the table sorted by support in a notebook.
data.sort_values(ascending=False,by="support")



OUTPUT:

                            items  support
12                       [google]        5
14                       [amazon]        5
8                           [cse]        4
11                  [cse, google]        4
13               [amazon, google]        4
0                        [python]        3
7                [python, amazon]        3
9                   [cse, amazon]        3
10          [cse, amazon, google]        3
1                   [python, cse]        2
2           [python, cse, amazon]        2
3                [python, google]        2
4        [python, amazon, google]        2
5   [python, cse, amazon, google]        2
6           [python, cse, google]        2







Monday 8 October 2018

APRIORI IMPLEMENTATION FROM SCRATCH

from itertools import combinations
import pandas as pd
def pruned(it,k,l):
    """Return True when every ``k``-item subset of ``it`` is found in ``l``.

    Subsets are generated from ``sorted(it)``, so each one is a tuple in
    ascending order. Returns False as soon as one subset is missing from
    ``l``. Note the sense: True means the candidate survives pruning.
    """
    return all(subset in l for subset in combinations(sorted(it),k))
def apriori_gen(c,data,minsp,k,l):
    """Join ``k``-itemsets that share a prefix into ``k+1``-item candidates.

    Args:
        c: sequence of sorted ``k``-itemsets to join pairwise.
        data: unused; kept for interface compatibility.
        minsp: unused; kept for interface compatibility.
        k: size of the itemsets in ``c``.
        l: collection of frequent ``k``-itemsets used for the prune step.

    Returns:
        List of unique sorted tuples, in order of first generation, for
        which every ``k``-item subset is in ``l`` (checked with ``pruned``).
    """
    C=[]
    for i,left in enumerate(c):
        head=left[0:(k-1)]
        for right in c[i+1:]:
            # Only itemsets sharing the first k-1 items are joined. Without
            # this guard a stale or empty candidate from an earlier
            # iteration reached the prune check and could be appended.
            if head!=right[0:(k-1)]:
                continue
            if isinstance(head,tuple):
                itm=head+(left[(k-1)],)+(right[(k-1)],)
            else:
                # Non-tuple itemsets (e.g. strings) are concatenated directly.
                itm=head+left[(k-1)]+right[(k-1)]
            itm=tuple(sorted(itm))
            if itm not in C and pruned(itm,k,l):
                C.append(itm)

    return C
def apriori(data,minsp):
    """Mine frequent itemsets from ``data`` with the Apriori algorithm.

    Args:
        data: sequence of transactions, each a sequence of hashable items
            that can be sorted against each other. It is not modified.
        minsp: minimum support, as an absolute number of transactions.

    Returns:
        pandas.DataFrame with the columns ``"items"`` (``str()`` of a single
        item, or of the sorted tuple for larger itemsets) and ``"support"``.
    """
    # Single-item supports. dict.fromkeys() drops repeats inside one
    # transaction while keeping first-occurrence order.
    item_counts={}
    for transaction in data:
        for item in dict.fromkeys(transaction):
            item_counts[item]=item_counts.get(item,0)+1

    fi=[(item,count) for item,count in item_counts.items() if count>=minsp]
    sfi=sorted(item for item,_ in fi)

    # Reduce every transaction to its frequent items in a local copy, so the
    # caller's list is left untouched.
    keep=set(sfi)
    transactions=[keep.intersection(transaction) for transaction in data]

    k=2
    candidates=list(combinations(sfi,2))
    level=candidates
    while level:
        if k>2:
            # Pass this call's own data rather than an unrelated global.
            candidates=apriori_gen(level,data,minsp,k-1,level)

        # Only well-formed candidates (k distinct items) can be contained in
        # a transaction as a k-combination; anything else is never counted.
        normalized=[tuple(sorted(cand)) for cand in candidates]
        normalized=[cand for cand in normalized if len(set(cand))==k]

        # A sorted k-itemset is one of the transaction's k-combinations
        # exactly when all of its items occur in the transaction.
        counts={}
        for items in transactions:
            for cand in normalized:
                if items.issuperset(cand):
                    counts[cand]=counts.get(cand,0)+1

        level=[cand for cand,count in counts.items() if count>=minsp]
        fi.extend((cand,counts[cand]) for cand in level)
        k+=1

    result=pd.DataFrame(columns=["items","support"])
    result["items"]=[str(items) for items,_ in fi]
    result["support"]=[count for _,count in fi]
    return result
--------------------------------------------------------------------
# Demo: run apriori on the first 500 baskets of the public groceries dataset.
import requests as rq

res=rq.get(r"https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/groceries.csv",timeout=30)
# Fail loudly on an HTTP error instead of mining an error page.
res.raise_for_status()

data=[]
for raw_line in res.iter_lines(1000000):
    # iter_lines() yields bytes here; decode them so items read
    # 'whole milk' rather than "b'whole milk'" (what str() on bytes gives).
    data.append(raw_line.decode("utf-8").split(','))

# Bare expression: displays the result table in a notebook.
apriori(data[0:500],2).sort_values("support",ascending=False)

CODING FP-GROWTH IN PYTHON FROM SCRATCH

dats=[['google','amazon',],['amazon','google','python','cse'],['cse','google&#...