Sunday, 28 October 2018

CODING FPGROWTH IN PYTHON FROM SCRATCH


# Toy transaction database used by the FP-growth demo below:
# one inner list per transaction, one string per item.
dats=[
    ['google','amazon'],
    ['amazon','google','python','cse'],
    ['cse','google'],
    ['amazon','python'],
    ['cse','amazon','python','google'],
    ['amazon','google','cse','data'],
]

class fptree:
    """A single node of an FP-tree.

    Attributes:
        ide: identifier stored on the node.
        cnt: counter stored on the node; raised through ``increm``.
        parent: the parent object handed to the constructor.
        link: initialised to ``None``.
        child: initialised to an empty dict.
    """

    def __init__(self,ide,cnt,parent):
        self.ide,self.cnt,self.parent=ide,cnt,parent
        self.link,self.child=None,{}

    def increm(self,cnt):
        """Add ``cnt`` to this node's counter."""
        self.cnt+=cnt

# NOTE(review): this listing is damaged. Only the item-counting prologue of
# genFi is intact; from the `if v1:` line onward the text is corrupted (see
# the note further down). Restore the function from the original source
# before running it.
def genFi(data,minsp,dic=False):

    kdc={}

    fi,sfi=[],[]

    nnewdata={}

    # Count, for every item, the number of transactions in `data` that
    # contain it: a repeat of the same item inside one transaction is
    # skipped by the `dat[i] in dat[0:i]` check.
    for dat in data:

        for i in range(0,len(dat)):

            if dat[i] not in kdc:

                kdc[dat[i]]=1

            elif dat[i] in dat[0:i]:

                continue

            else:

                kdc[dat[i]]+=1

    for k,v in kdc.items():

        # NOTE(review): `if v1:` has no body, and `v1` is not defined in the
        # visible code. The next line refers to `genTree`, `cnt` and `null`,
        # none of which are defined in the visible part of this function
        # either. Presumably the rest of genFi and the beginning of genTree
        # were lost here — TODO confirm against the original.
        if v1:

        genTree(data[1:],cnt,null.child[data[0]],kdc)

#nodes with same names but in different paths

def updateNull(alr_pre_node,same_new):
    """Append ``same_new`` to the end of a node-link chain.

    Starting at ``alr_pre_node``, follow ``.link`` until a node whose
    ``link`` is ``None`` is reached, then point that node's ``link`` at
    ``same_new``.

    Args:
        alr_pre_node: first node of the existing chain.
        same_new: node to attach at the end of the chain.
    """
    tail=alr_pre_node
    # Identity test: None is a singleton, and ``is not`` cannot be fooled by
    # a node type that overrides ``__eq__``/``__ne__``.
    while tail.link is not None:
        tail=tail.link
    tail.link=same_new

def gen_cond_pattern_bases(node):
    """Collect the prefix paths of every node on a node-link chain.

    For ``node`` and each node reachable from it through ``.link``, the path
    towards the root is gathered with ``bottom_up``. The first element of
    that path is the node's own ``ide``, so it is dropped; what remains is
    stored as a tuple key mapped to the node's ``cnt``. Nodes whose path
    holds nothing besides themselves contribute no entry.

    Args:
        node: first node of the chain, or ``None`` for an empty chain.

    Returns:
        dict mapping ``tuple(prefix path)`` to the ``cnt`` of the chain node
        that produced it.
    """
    patterns={}
    # ``is not None`` instead of ``!=None``: identity is the right test for
    # the singleton and does not depend on how nodes implement comparison.
    while node is not None:
        prefix=[]
        bottom_up(node,prefix)
        if len(prefix)>1:
            patterns[tuple(prefix[1:])]=node.cnt
        node=node.link
    return patterns

def bottom_up(node,prefix):
    """Append the ``ide`` of ``node`` and of its ancestors to ``prefix``.

    Walks the ``parent`` pointers starting at ``node`` and appends each
    ``ide`` in that order. The node whose ``parent`` is ``None`` is not
    appended.

    Implemented as a loop rather than recursion so that very long paths do
    not run into the interpreter's recursion limit.

    Args:
        node: node to start from.
        prefix: list that is extended in place.
    """
    while node.parent is not None:
        prefix.append(node.ide)
        node=node.parent

def cond_tree(null,kdc,minsup,prefix,freq_items,sup=None,sing_sup=0):
    """Recursively mine frequent itemsets from a header table.

    Every item of ``kdc`` is combined with ``prefix`` and appended to
    ``freq_items`` together with a support value. A conditional data set is
    then rebuilt from the item's prefix paths, ``genFi`` is called on it,
    and the function recurses on the result.

    Args:
        null: root of the current tree (only forwarded by the recursion).
        kdc: header table; each value is indexed as ``[0]`` for the support
            count and ``[1]`` for the first node of the item's link chain.
        minsup: minimum support passed on to ``genFi`` for the conditional
            data sets.
        prefix: set of items already fixed on this branch of the recursion.
        freq_items: output list, extended in place with
            ``(set_of_items, support)`` tuples.
        sup: pattern-base dict (``tuple -> count``) of the enclosing call;
            omitted or empty at the top level.
        sing_sup: table whose ``[item][0]`` gives the support of a single
            item; used when ``sup`` yields no count for the item.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if sup is None:
        sup={}

    # Order items by support count only. Sorting on the whole table entry
    # would, on equal counts, go on to compare the node objects themselves.
    ordered=[item for item,_ in sorted(kdc.items(),key=lambda p:p[1][0])]

    for it in ordered:
        new_freq_items=prefix.copy()
        new_freq_items.add(it)

        # Support of the extended itemset: total count of the enclosing
        # pattern bases that contain this item.
        cond_support=sum(v for k,v in sup.items() if it in k)
        if cond_support!=0:
            freq_items.append((new_freq_items,cond_support))
        else:
            freq_items.append((new_freq_items,sing_sup[it][0]))

        patterns=gen_cond_pattern_bases(kdc[it][1])

        # Expand each prefix path into as many transactions as its count.
        da=[list(k) for k,v in patterns.items() for _ in range(v)]

        # Honour the caller's threshold; it used to be hard-coded to 2 here.
        new_null,new_kdc=genFi(da,minsup)

        if new_null is not None:
            cond_tree(new_null,new_kdc,minsup,new_freq_items,freq_items,patterns,sing_sup)

# pandas is needed for the result table below; nothing earlier in this
# listing imports it.
import pandas as pd

# Build the tree and header table for the toy data (minimum support 2),
# then mine every frequent itemset into `freq_itemss`.
htable=genFi(dats,2)

freq_itemss=[]

cond_tree(htable[0],htable[1],2,set(),freq_itemss,{},htable[1])

freq_itemss

# Tabulate the (itemset, support) pairs.
its=[list(itemset) for itemset,_ in freq_itemss]

sps=[support for _,support in freq_itemss]

data=pd.DataFrame(columns=["items","support"])

data["items"]=its

data["support"]=sps

# Notebook-style final expression: the sorted frame is displayed, `data`
# itself is left unsorted.
data.sort_values(ascending=False,by="support")



OUTPUT:

                            items  support
12                       [google]        5
14                       [amazon]        5
8                           [cse]        4
11                  [cse, google]        4
13               [amazon, google]        4
0                        [python]        3
7                [python, amazon]        3
9                   [cse, amazon]        3
10          [cse, amazon, google]        3
1                   [python, cse]        2
2           [python, cse, amazon]        2
3                [python, google]        2
4        [python, amazon, google]        2
5   [python, cse, amazon, google]        2
6           [python, cse, google]        2







Monday, 8 October 2018

APRIORI IMPLEMENTATION FROM SCRATCH

from itertools import combinations
import pandas as pd
def pruned(it,k,l):
    """Tell whether every size-``k`` subset of ``it`` occurs in ``l``.

    The subsets are the ``k``-combinations of ``sorted(it)``, i.e. sorted
    tuples. Returns ``True`` when all of them are members of ``l`` (also
    when there are no such subsets) and ``False`` otherwise.
    """
    return all(subset in l for subset in combinations(sorted(it),k))
def apriori_gen(c,data,minsp,k,l):
    """Generate size ``k+1`` candidates by joining size-``k`` itemsets.

    Two itemsets of ``c`` are joined when their first ``k-1`` items are
    equal; the candidate is the shared prefix plus the ``k``-th item of
    each, sorted into a tuple. A candidate is kept only if ``pruned``
    accepts it against ``l`` and it has not been produced already.

    The prune/append step now runs only for pairs that actually joined.
    Previously it also ran for non-matching pairs, re-testing a stale
    candidate or appending the initial empty list ``[]`` to the result.

    Args:
        c: sequence of itemsets, each with at least ``k`` items.
        data: unused; kept for interface compatibility.
        minsp: unused; kept for interface compatibility.
        k: size of the itemsets in ``c``.
        l: collection of frequent size-``k`` itemsets used for pruning.

    Returns:
        list of candidate tuples, in order of first generation.
    """
    candidates=[]
    seen=set()
    # Normalise once so the join works the same for tuples and other
    # sequences (a tuple is returned unchanged by tuple()).
    itemsets=[tuple(itemset) for itemset in c]
    for i,first in enumerate(itemsets):
        head=first[:k-1]
        for second in itemsets[i+1:]:
            if second[:k-1]!=head:
                continue
            itm=tuple(sorted(head+(first[k-1],)+(second[k-1],)))
            if itm in seen:
                continue
            if pruned(itm,k,l):
                seen.add(itm)
                candidates.append(itm)
    return candidates
def apriori(data,minsp):
    """Mine frequent itemsets from ``data`` with the Apriori algorithm.

    Args:
        data: sequence of transactions, each a sequence of hashable,
            mutually sortable items. It is not modified.
        minsp: minimum number of transactions an itemset must occur in.

    Returns:
        pandas DataFrame with columns ``"items"`` (``str()`` of the single
        item or of the sorted item tuple) and ``"support"`` (its count),
        single items first, then itemsets of growing size.
    """
    # Support of single items; an item repeated inside one transaction is
    # counted once. dict.fromkeys keeps first-seen order.
    kdc={}
    for dat in data:
        for item in dict.fromkeys(dat):
            kdc[item]=kdc.get(item,0)+1

    fi=[(item,cnt) for item,cnt in kdc.items() if cnt>=minsp]
    sfi=sorted(item for item,_ in fi)

    # Work on reduced copies of the transactions (frequent items only)
    # instead of overwriting the caller's list.
    frequent=set(sfi)
    transactions=[frequent.intersection(dat) for dat in data]

    k=2
    c=list(combinations(sfi,2))
    l=c
    while l:
        if k>2:
            # Pass this function's own data, not a module-level global.
            c=apriori_gen(l,data,minsp,k-1,l)

        # Keep only well-formed candidates: exactly k distinct items.
        cands=[]
        for itm in c:
            key=tuple(sorted(itm))
            members=set(key)
            if len(key)==k and len(members)==k:
                cands.append((key,members))

        # A candidate is contained in a transaction iff it is a subset of
        # it; no need to enumerate all k-combinations of the transaction.
        dic={}
        for transaction in transactions:
            if len(transaction)<k:
                continue
            for key,members in cands:
                if members<=transaction:
                    dic[key]=dic.get(key,0)+1

        l=[]
        for key,v in dic.items():
            if v>=minsp:
                fi.append((key,v))
                l.append(key)
        k=k+1

    result=pd.DataFrame(columns=["items","support"])
    result["items"]=[str(itemset) for itemset,_ in fi]
    result["support"]=[cnt for _,cnt in fi]
    return result
--------------------------------------------------------------------
import requests as rq
# Download the groceries data set: one transaction per line, items
# separated by commas.
res=rq.get(r"https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/groceries.csv")
# Stop on an HTTP error instead of mining an error page.
res.raise_for_status()
data=[]
for it in res.iter_lines(1000000):
    # iter_lines yields bytes; decode them so items are plain labels
    # ("milk") rather than the repr of a bytes object ("b'milk'").
    data.append([itm.decode("utf-8") for itm in it.split(b',')])

# Frequent itemsets of the first 500 transactions, minimum support 2.
apriori(data[0:500],2).sort_values("support",ascending=False)

Sunday, 2 September 2018

COPE COPY

#! /bin/sh
# Copy every file/folder whose path is placed on the clipboard into one
# target directory; on CTRL+C optionally zip the target directory and exit.

# Start from an empty clipboard.
xsel -bc
path="$HOME"
fil=""
filname=""
bool=0
echo "enter the output directory path:"
read -r path
# Fall back to the home directory when nothing was entered.
path=${path:-$HOME}
if [ ! -e "$path" ]; then
    mkdir -p -- "$path"
fi
echo "do you want to compress the folder:[y/n]"
read -r bool

# CTRL+C handler: zip the target directory if requested, then exit.
# POSIX function syntax, so the script also runs when /bin/sh is not bash.
trap_ctrlc()
{
    zpath=$(basename -- "$path")
    if [ "$bool" = "y" ] || [ "$bool" = "Y" ]; then
        zip -r "$zpath" "$path"
    fi
    exit 2
}
trap trap_ctrlc INT

while :
do
    # An empty clipboard makes xclip print an error; silence it.
    fil=$(xclip -selection clipboard -o 2>/dev/null)
    # Only act when something was copied; an empty value used to be
    # reported as "file already exists" once per second.
    if [ -n "$fil" ]; then
        filname=$(basename -- "$fil")
        if [ -e "$path/$filname" ]; then
            echo "file already exists"
        else
            # Quoted so that paths containing spaces are copied correctly.
            cp -R -- "$fil" "$path"
            echo "$fil" "copied to" "$path"
        fi
        # Clear the clipboard so the same entry is not handled twice.
        xsel -bc
    fi
    sleep 1
done



Overview

Copying files or folders from different locations into one particular location is tedious when many files from many folders have to be copied. This project makes the task plain and easy by reading the clipboard contents from a shell script and using Linux commands to copy the files.

Goals

  • The main goal of this project is to minimize the tedium of copying files from different locations, one at a time, and pasting them into the target folder.
  • The goal is to run the shell script in a terminal and then proceed normally, copying files either by pressing (CTRL+C) or through the graphical user interface (GUI), while the running script copies the files/folders into the target folder.
  • Ultimately, there is an option to compress the target folder which can be further used for mailing all the copied files easily.

Specifications

  • The project is written in shell script (GNU bash version 4.4.23).
  • The shell script is run on Kali GNU/Linux Rolling operating system.
  • The shell script is run on linux kernel (Linux 4.12.0-kali1-amd64).
  • Basic linux commands were used in the code.
  • Linux packages like xclip,xsel etc were used to make tasks efficient.
Working of shell script(program):
     The program initially clears the clipboard using the xsel command [1]. The program then asks the user to enter the target directory path [7]. If the entered path doesn’t exist, it creates the target folder at the given path by default [10]. The program then asks whether to compress the target folder after all files have been copied into it [12].

    Then the program enters the while loop and checks for content in the clipboard. Whenever we press (CTRL+C) on a file/folder to copy it, the file's path is stored in the clipboard, and the program accesses the clipboard via the xclip package and its commands [25]. It then checks whether the file is already present in the given target path. If it is already present, copying is skipped; otherwise the file is copied into the folder [27-32]. Then the clipboard contents are cleared so that another file can be copied, and the program sleeps for one second so as to complete the task before returning to the while loop [28-29].

   The while loop ends when we press (CTRL+C) in the terminal; (CTRL+C) is handled by the trap command, which checks whether to compress the folder [22]. Control passes to the handler function, which, based on the user's choice, either compresses the folder or leaves it as it is in the path [14-21].

Note:[i] refers to the line number in the code.

Tuesday, 24 July 2018

LOGISTIC REGRESSION FROM SCRATCH,PYTHON

This is classification algorithm:
example:
        suppose, you have given the features such as (length of hair,type of dress,age,weight,height,etc) of a person and you have to predict whether
person is male or female.so you need to classify as male or female.

LOGISTIC MODEL:
- Linear regression is not exactly useless for this model, but a straight line cannot fit this
  distribution.
- We take an intuition from linear regression to build logistic regression.
- In linear regression (LR) we are actually predicting a value; in logistic regression we are classifying (male 1 or female 0), i.e. it is binary classification.
- A linear model cannot be used directly because it can predict values beyond 1 and below 0.
- The sigmoid function restricts this value to between 0 and 1; values of 0.5 or above from the sigmoid
  function are classified as 1 and values below 0.5 as 0.
                                       
**read more about logistic regression here:
PYTHON PROGRAM :
import csv
import numpy as np
import matplotlib.pyplot as plt
def loadCSV(fn):
    """Read a purely numeric CSV file into a NumPy array.

    Args:
        fn: path of the CSV file; every field must be convertible by
            ``float``.

    Returns:
        ``np.array`` built from the rows, each row a list of floats.

    Raises:
        ValueError: if a field cannot be converted to ``float``.
    """
    # newline='' is how the csv module expects its input file to be opened,
    # so that line endings inside the file are handled by the reader.
    with open(fn,'r',newline='') as csf:
        rows=[[float(x) for x in row] for row in csv.reader(csf)]
    return np.array(rows)
In [2]:
           
def norm(X):
    """Min-max scale ``X`` along axis 0.

    Each value becomes ``1 - (max - x) / (max - min)``, using the minimum
    and maximum taken along axis 0, so the minimum maps to 0 and the
    maximum to 1.

    A column whose values are all equal has a zero range; it is mapped to
    0.0 instead of producing NaN from a 0/0 division.

    Args:
        X: numeric array.

    Returns:
        array of the same shape as ``X`` with the scaled values.
    """
    mi=np.min(X,axis=0)
    mx=np.max(X,axis=0)
    rng=mx-mi
    constant=rng==0
    # Divide by 1 where the range is zero to avoid 0/0; those entries are
    # overwritten with 0.0 below.
    safe_rng=np.where(constant,1,rng)
    return np.where(constant,0.0,1-((mx-X)/safe_rng))
def logfn(theta,X):
    """Apply the logistic (sigmoid) function to ``np.dot(X, theta.T)``."""
    z=np.dot(X,theta.T)
    return 1/(1+np.exp(-z))
def log_grad(theta,X,y):
    """Return ``(logfn(theta, X) - y)^T . X``.

    ``y`` is reshaped to ``(X.shape[0], -1)`` before it is subtracted from
    the output of ``logfn``; the difference is transposed and multiplied
    with ``X`` through ``np.dot``.
    """
    residual=logfn(theta,X)-y.reshape(X.shape[0],-1)
    return np.dot(residual.T,X)
def cost(theta,X,y):
    """Return the cross-entropy style cost of the model ``theta`` on ``(X, y)``.

    Computes ``-(y*log(p) + (1-y)*log(1-p))`` with ``p = logfn(theta, X)``
    and passes the result to ``np.mean``.
    """
    log_fn=logfn(theta,X)
    # Only y is squeezed; log_fn keeps whatever shape/type logfn returned.
    y=np.squeeze(y)
    # NOTE(review): the meaning of `*` below depends on the type of log_fn.
    # If logfn returns an (n, 1) np.matrix (presumably the case when theta is
    # an np.matrix — TODO confirm against the caller), `*` is a matrix
    # product and s1/s2 are 1x1 sums over all samples, so np.mean returns the
    # total rather than a per-sample average. With plain ndarrays of shapes
    # (n,) and (n, 1) the product would broadcast to (n, n) instead. Verify
    # before changing anything here.
    s1=y*np.log(log_fn)
    s2=(1-y)*np.log(1-log_fn)
    fi=-(s1+s2)
    return np.mean(fi)
def grad_desc(X,y,theta,lr=0.05,conv_change=0.001):
    """Fit ``theta`` by gradient descent.

    Each step subtracts ``lr * log_grad(theta, X, y) / len(y)`` from
    ``theta``. The loop runs while the cost dropped by more than
    ``conv_change`` in the previous step and the step counter is below 500.

    Returns:
        ``(theta, noi)``: the final parameters and the step counter, which
        starts at 1 and is incremented after every update.
    """
    current=cost(theta,X,y)
    drop=1
    noi=1
    while noi<500 and drop>conv_change:
        previous=current
        step=(lr*log_grad(theta,X,y))/len(y)
        theta=theta-step
        current=cost(theta,X,y)
        drop=previous-current
        noi+=1
    return theta,noi
def pred(theta,X):
    """Return 0/1 labels: 1 where ``logfn(theta, X) >= 0.5``, else 0.

    The label array is passed through ``np.squeeze`` before it is returned.
    """
    labels=np.where(logfn(theta,X)>=0.5,1,0)
    return np.squeeze(labels)
# Driver: load the data, fit the model, report the result on the same data.
data=loadCSV('../input/logistic.csv')
# Every column except the last is a feature; scale the features with norm().
X=norm(data[:,:-1])
# Prepend a column of ones to X; np.matrix is used so the stacked result is
# an np.matrix.
X=np.hstack((np.matrix(np.ones(X.shape[0])).T,X))
# The last column holds the labels.
y=data[:,-1]
# Start from all-zero parameters, one per column of X (1 x n np.matrix).
theta=np.matrix(np.zeros(X.shape[1]))
theta,noi=grad_desc(X,y,theta)
print("estimated regression coefficients:",theta)
print("no of iterations:",noi)
# Predictions are made on the data the model was fitted on.
ypred=pred(theta,X)
print("correctly predicted labels",np.sum(y==ypred))
estimated regression coefficients: [[ 0.2297094   1.20762038 -1.8150657 ]]
no of iterations: 500
correctly predicted labels 100

LINEAR REGRESSION FROM SCRATCH,PYTHON

This algorithm is basic one to dive into machine learning.
Linear models for regression:
- y=mX+c is the linear regression that fits the plots.
- Basically, it reduces the distance between the line and all data points, and the best
  m and c values are found using an optimisation algorithm.
- The optimisation algorithm used here is gradient descent, which repeatedly updates m and c to reduce the cost.
- y is the output (the value to be predicted) and X is the features (predictors).
- y is dependent variable on variables of X
The optimisation algorithm used is GRADIENT DESCENT
PYTHON PROGRAM:

from numpy import *
# Module-level state shared by the functions below:
#   ptsp    - data points; replaced inside run()
#   thetal  - list extended by gradesc_runner() on every iteration
#   theta1, theta2 - set at the end of run()
ptsp,thetal=[],[]
theta1=theta2=0
def predict(x,m=1.322,b=7.991):
    """Evaluate the line ``m*x + b``.

    Args:
        x: input value.
        m: slope; defaults to the constant previously hard-coded here.
        b: intercept; defaults to the constant previously hard-coded here.

    Returns:
        ``m*x + b``. With the defaults this is the original
        ``1.322*x + 7.991``.
    """
    y= m*x + b
    return y
def error(b,m,pts):
    """Mean squared error of the line ``m*x + b`` over ``pts``.

    ``pts`` is indexed as ``pts[i,0]`` (x) and ``pts[i,1]`` (y); the squared
    residuals are summed in row order and divided by ``float(len(pts))``.
    """
    count=len(pts)
    total=sum((pts[i,1]-(m*pts[i,0]+b))**2 for i in range(count))
    return total/float(count)
def run():
    """Load the data, filter it, run gradient descent and store the result.

    Reads ``../input/data.csv``, removes rows whose first column is
    ``<=35`` or ``>=65`` and then rows whose second column is ``<=45`` or
    ``>=100``, fits a line with ``gradesc_runner`` and prints the error
    before and after. The filtered points are left in the global ``ptsp``;
    the fitted slope and intercept in ``theta1`` and ``theta2``.
    """
    global theta1,theta2,ptsp
    import numpy as np

    pts=genfromtxt("../input/data.csv",delimiter=",")
    ptsp=pts
    xx,yy=ptsp[:,0],ptsp[:,1]

    # First pass: drop rows by their x value.
    drop=[i for i in range(len(xx)) if xx[i]<=35 or xx[i]>=65]
    xx=np.delete(xx,drop)
    yy=np.delete(yy,drop)

    # Second pass: drop rows by their y value.
    drop=[i for i in range(len(yy)) if yy[i]<=45 or yy[i]>=100]
    xx=np.delete(xx,drop)
    yy=np.delete(yy,drop)

    ptsp=np.array([[k,v] for k,v in zip(xx,yy)])
    pts=ptsp

    lr,noi=0.0001,100000
    ib,im=0,0
    print("starting gradient descent at b={0},m={1},error={2}".format(ib,im,error(ib,im,pts)))
    print("running....")
    b,m=gradesc_runner(pts,ib,im,lr,noi)
    print("after {0} iterations ,b={1},m={2},error={3}".format(noi,b,m,error(b,m,pts)))
    theta1,theta2=m,b
def gradesc_runner(pts,ib,im,lr,noi):
    """Run ``noi`` gradient-descent steps starting from ``(ib, im)``.

    After every step ``[b, m, error(b, m, pts)]`` is appended to the global
    list ``thetal``.

    Returns:
        ``[b, m]`` after the last step.
    """
    global thetal
    b,m=ib,im
    for _ in range(noi):
        b,m=gradesc(b,m,array(pts),lr)
        thetal.append([b,m,error(b,m,pts)])
    return [b,m]
def gradesc(b,m,pts,lr):
    """Perform one gradient-descent step for the line ``m*x + b``.

    Accumulates, over all rows of ``pts`` (x in column 0, y in column 1),
    the terms ``-(2/N)*(y-(m*x+b))`` for the intercept and the same term
    times ``x`` for the slope, then moves ``b`` and ``m`` against those sums
    scaled by ``lr``.

    Returns:
        the updated ``(b, m)``.
    """
    N=float(len(pts))
    bg=mg=0
    for i in range(len(pts)):
        x,y=pts[i,0],pts[i,1]
        residual=y-((m*x)+b)
        bg+=-(2/N)*residual
        mg+=-(2/N)*x*residual
    return b-(lr*bg),m-(lr*mg)
# Execute the fit; run() prints its progress and stores its results in the
# module-level globals.
run()
starting gradient descent at b=0,m=0,error=5565.947156039996
running....
after 100000 iterations ,b=6.232803360817703,m=1.3444468870190596,error=94.6469921800091
In [2]:
# Notebook cell: scatter the points held in `ptsp` and draw the fitted line.
%matplotlib inline
import matplotlib.pyplot as plt
plt.scatter(ptsp[:,0],ptsp[:,1],c='r')
# NOTE(review): the line is drawn with predict(), whose slope and intercept
# are fixed constants, not the theta1/theta2 values stored by run().
plt.plot(ptsp[:,0],[predict(k) for k in ptsp[:,0]])
Out[2]:
[<matplotlib.lines.Line2D at 0x7f656c7b4ba8>]

CODING FPGROWTH IN PYTHON FROM SCRATCH

dats=[['google','amazon',],['amazon','google','python','cse'],['cse','google...